Skip to content

Commit 6f5189b

Browse files
committed
Enable mix video and image input.
Signed-off-by: xipingya <[email protected]>
1 parent a2ad61b commit 6f5189b

File tree

16 files changed

+91
-90
lines changed

16 files changed

+91
-90
lines changed

src/cpp/src/continuous_batching/pipeline_base.cpp

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,6 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
167167

168168
OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs.");
169169
OPENVINO_ASSERT(prompts.size() == rgbs_vector.size() || prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of images/video vectors.");
170-
OPENVINO_ASSERT(rgbs_vector.size() == 0u || video_vector.size() == 0u, "Only support one input, video or images");
171170

172171
std::vector<ov::Tensor> input_embeds_list;
173172
std::vector<ov::Tensor> token_type_ids_list;
@@ -177,11 +176,12 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
177176

178177
if (m_is_chat_conversation) {
179178
OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
180-
const auto& rgbs = video_vector.empty() ? rgbs_vector[0] : video_vector[0];
181179
const auto& prompt = prompts[0];
182180
auto start_get_inputs_embeds = std::chrono::steady_clock::now();
183181

184-
encoded_images = m_inputs_embedder->encode_images(rgbs, rgbs_vector.empty());
182+
auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[0] : std::vector<ov::Tensor>{};
183+
auto video_rgbs = video_vector.size() > 0 ? video_vector[0] : std::vector<ov::Tensor>{};
184+
encoded_images = m_inputs_embedder->encode_images(image_rgbs, video_rgbs);
185185
m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end());
186186

187187
const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);
@@ -193,11 +193,19 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
193193
m_inputs_embedder->set_apply_chat_template_status(false);
194194

195195
if (m_inputs_embedder->has_token_type_ids()) {
196-
auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids);
196+
auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history,
197+
m_history_images,
198+
vlm_perf_metrics[0],
199+
true,
200+
m_history_image_ids);
197201
input_embeds_list.push_back(std::move(embeds));
198202
token_type_ids_list.push_back(std::move(tt_ids));
199203
} else {
200-
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids));
204+
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(templated_history,
205+
m_history_images,
206+
vlm_perf_metrics[0],
207+
true,
208+
m_history_image_ids));
201209
}
202210

203211
auto end_get_inputs_embeds = std::chrono::steady_clock::now();
@@ -206,11 +214,12 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
206214
} else {
207215
for (size_t i = 0; i < prompts.size(); i++) {
208216
const auto& prompt = prompts[i];
209-
const auto& rgbs = video_vector.empty() ? rgbs_vector[i] : video_vector[i];
210217

211218
auto start_get_inputs_embeds = std::chrono::steady_clock::now();
212219

213-
const auto encoded_images = m_inputs_embedder->encode_images(rgbs, rgbs_vector.empty());
220+
auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[i] : std::vector<ov::Tensor>{};
221+
auto video_rgbs = video_vector.size() > 0 ? video_vector[i] : std::vector<ov::Tensor>{};
222+
const auto encoded_images = m_inputs_embedder->encode_images(image_rgbs, video_rgbs);
214223
auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);
215224

216225
m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template);
@@ -280,7 +289,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re
280289
std::lock_guard<std::mutex> lock(m_embeddings_mutex);
281290
m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);
282291

283-
auto encoded_images = m_inputs_embedder->encode_images(rgbs, false);
292+
auto encoded_images = m_inputs_embedder->encode_images(rgbs, std::vector<ov::Tensor>{});
284293

285294
const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images);
286295
inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence);
@@ -295,15 +304,14 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re
295304
const std::vector<ov::Tensor>& video,
296305
GenerationConfig sampling_params) {
297306
OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings.");
298-
OPENVINO_ASSERT((video.size() == 0u || images.size() == 0u), "Only support one input, video or images.");
307+
299308
ov::genai::VLMPerfMetrics metrics;
300309
ov::Tensor inputs;
301310
{
302311
std::lock_guard<std::mutex> lock(m_embeddings_mutex);
303312
m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);
304313

305-
auto encoded_images = video.size() == 0 ? m_inputs_embedder->encode_images(images, false)
306-
: m_inputs_embedder->encode_images(video, true);
314+
auto encoded_images = m_inputs_embedder->encode_images(images, video);
307315

308316
const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images);
309317
inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence);

src/cpp/src/visual_language/gemma3/classes.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,20 +71,16 @@ bool InputsEmbedderGemma3::has_token_type_ids() const {
7171
return true;
7272
}
7373

74-
std::vector<ov::genai::EncodedImage> InputsEmbedderGemma3::encode_images(const std::vector<ov::Tensor>& images, const bool& is_video) {
74+
std::vector<ov::genai::EncodedImage> InputsEmbedderGemma3::encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) {
75+
if (video.size() > 0) {
76+
OPENVINO_THROW("Gemma3 doesn't support video preprocessing currently; video inputs are rejected.");
77+
}
78+
7579
std::vector<EncodedImage> embeds;
7680

7781
ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
7882

7983
std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
80-
if (is_video) {
81-
embeds = m_vision_encoder->encode_video(single_images, vision_config);
82-
if (!embeds.empty()) {
83-
return embeds;
84-
}
85-
// Fallback to image process.
86-
}
87-
8884
embeds.reserve(single_images.size());
8985
for (const ov::Tensor& image : single_images) {
9086
embeds.emplace_back(m_vision_encoder->encode(image, vision_config));

src/cpp/src/visual_language/gemma3/classes.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class InputsEmbedderGemma3 : public InputsEmbedder::IInputsEmbedder {
4141

4242
bool has_token_type_ids() const override;
4343

44-
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const bool& is_video = false) override;
44+
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) override;
4545

4646
std::pair<std::string, std::vector<size_t>> normalize_prompt(const std::string& prompt, size_t base_id, const std::vector<EncodedImage>& images) const override;
4747

src/cpp/src/visual_language/inputs_embedder.cpp

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -165,31 +165,33 @@ std::vector<ov::Tensor> InputsEmbedder::IInputsEmbedder::to_single_image_tensors
165165
return single_image_tensors;
166166
}
167167

168-
std::vector<ov::genai::EncodedImage> InputsEmbedder::IInputsEmbedder::encode_images(const std::vector<ov::Tensor>& images, const bool& is_video) {
169-
std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
168+
std::vector<ov::genai::EncodedImage> InputsEmbedder::IInputsEmbedder::encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) {
170169
std::vector<EncodedImage> embeds;
171170

172-
if (is_video) {
173-
return m_vision_encoder->encode_video(single_images);
171+
for (const ov::Tensor& single_video : video) {
172+
std::vector<ov::Tensor> single_frames = to_single_image_tensors({single_video});
173+
auto embeds_video = m_vision_encoder->encode_video(single_frames);
174+
embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end());
174175
}
175176

177+
std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
176178
for (const ov::Tensor& image : single_images) {
177179
embeds.emplace_back(m_vision_encoder->encode(image));
178180
}
179181
return embeds;
180182
}
181183

182-
ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence) {
183-
return get_inputs_embeds(prompt, encode_images(images, is_video), metrics, true, image_sequence);
184+
ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence) {
185+
return get_inputs_embeds(prompt, encode_images(images, video), metrics, true, image_sequence);
184186
}
185187

186188
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids(
187189
const std::string& prompt,
188190
const std::vector<ov::Tensor>& images,
189-
const bool& is_video,
191+
const std::vector<ov::Tensor>& video,
190192
ov::genai::VLMPerfMetrics& metrics,
191193
const std::vector<size_t>& image_sequence) {
192-
return get_inputs_embeds_with_token_type_ids(prompt, encode_images(images, is_video), metrics, true, image_sequence);
194+
return get_inputs_embeds_with_token_type_ids(prompt, encode_images(images, video), metrics, true, image_sequence);
193195
}
194196

195197
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids(
@@ -267,8 +269,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map,
267269
}
268270
}
269271

270-
ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence) {
271-
return m_impl->get_inputs_embeds(prompt, images, is_video, metrics, image_sequence);
272+
ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence) {
273+
return m_impl->get_inputs_embeds(prompt, images, video, metrics, image_sequence);
272274
}
273275

274276
ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector<size_t>& image_sequence) {
@@ -278,11 +280,10 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st
278280
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::get_inputs_embeds_with_token_type_ids(
279281
const std::string& prompt,
280282
const std::vector<ov::Tensor>& images,
281-
const bool& is_video,
283+
const std::vector<ov::Tensor>& video,
282284
VLMPerfMetrics& metrics,
283285
const std::vector<size_t>& image_sequence) {
284-
return m_impl->get_inputs_embeds_with_token_type_ids(
285-
prompt, images, is_video, metrics, image_sequence);
286+
return m_impl->get_inputs_embeds_with_token_type_ids(prompt, images, video, metrics, image_sequence);
286287
}
287288

288289
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::get_inputs_embeds_with_token_type_ids(
@@ -299,8 +300,8 @@ bool InputsEmbedder::has_token_type_ids() const {
299300
return m_impl->has_token_type_ids();
300301
}
301302

302-
std::vector<ov::genai::EncodedImage> InputsEmbedder::encode_images(const std::vector<ov::Tensor>& images, const bool& is_video) {
303-
return m_impl->encode_images(images, is_video);
303+
std::vector<ov::genai::EncodedImage> InputsEmbedder::encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) {
304+
return m_impl->encode_images(images, video);
304305
}
305306

306307
std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {

src/cpp/src/visual_language/inputs_embedder.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,18 @@ class InputsEmbedder {
3535
const ov::AnyMap device_config);
3636

3737
// compute input embedding for prompt and multiple images
38-
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence);
38+
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence);
3939

4040
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {});
4141

4242
// compute input embedding and token_type_ids
43-
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<ov::Tensor>& images, const bool& is_video, VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence = {});
43+
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video, VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence = {});
4444

4545
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<EncodedImage>& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {});
4646

4747
bool has_token_type_ids() const;
4848

49-
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const bool& is_video = false);
49+
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video);
5050

5151
// compute position ids for language model input
5252
std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);
@@ -108,15 +108,15 @@ class InputsEmbedder {
108108
public:
109109
virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {}) = 0;
110110

111-
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence);
111+
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence);
112112

113-
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<ov::Tensor>& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence = {});
113+
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video, ov::genai::VLMPerfMetrics& metrics, const std::vector<size_t>& image_sequence = {});
114114

115115
virtual std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {});
116116

117117
virtual bool has_token_type_ids() const;
118118

119-
virtual std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const bool& is_video = false);
119+
virtual std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video);
120120

121121
virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);
122122

src/cpp/src/visual_language/llava/classes.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,11 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA(
9292
const ov::AnyMap device_config) :
9393
IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
9494

95-
std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVA::encode_images(const std::vector<ov::Tensor>& images, const bool& is_video) {
96-
if (is_video) {
97-
Logger::warn("LLaVA doesn't support video preprocess currently. Input images are processed as separate images.");
95+
std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVA::encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) {
96+
if (video.size() > 0) {
97+
OPENVINO_THROW("LLaVA doesn't support video preprocessing currently; video inputs are rejected.");
9898
}
99+
99100
std::vector<EncodedImage> embeds;
100101
ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
101102
std::vector<ov::Tensor> single_images = to_single_image_tensors(images);

src/cpp/src/visual_language/llava/classes.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
3737

3838
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {}) override;
3939

40-
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const bool& is_video = false) override;
40+
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) override;
4141

4242
std::pair<std::string, std::vector<size_t>> normalize_prompt(
4343
const std::string& prompt,

src/cpp/src/visual_language/llava_next/classes.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,9 +333,9 @@ ov::Tensor pack_image_features_llava_next(
333333

334334
} // namespace
335335

336-
std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVANext::encode_images(const std::vector<ov::Tensor>& images, const bool& is_video) {
337-
if (is_video) {
338-
Logger::warn("LLaVANext doesn't support video preprocess currently. Input images are processed as separate images.");
336+
std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVANext::encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) {
337+
if (video.size() > 0) {
338+
OPENVINO_THROW("LLaVANext doesn't support video preprocessing currently; video inputs are rejected.");
339339
}
340340

341341
std::vector<EncodedImage> embeds;

src/cpp/src/visual_language/llava_next/classes.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
2424

2525
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {}) override;
2626

27-
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const bool& is_video = false) override;
27+
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& video) override;
2828

2929
std::pair<std::string, std::vector<size_t>> normalize_prompt(
3030
const std::string& prompt,

0 commit comments

Comments (0)