Skip to content

Commit 515c911

Browse files
committed
Fall back to the image-only overloads when the model does not implement video processing.
Signed-off-by: xiping.yan <[email protected]>
1 parent 123221b commit 515c911

File tree

3 files changed

+75
-7
lines changed

3 files changed

+75
-7
lines changed

src/cpp/src/continuous_batching/pipeline_base.cpp

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,12 +203,13 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
203203
m_inputs_embedder->set_apply_chat_template_status(false);
204204

205205
if (m_inputs_embedder->has_token_type_ids()) {
206-
// Todo: support video
207206
auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history,
208207
m_history_images,
208+
m_history_videos,
209209
vlm_perf_metrics[0],
210210
true,
211-
m_history_image_ids);
211+
m_history_image_ids,
212+
m_history_video_ids);
212213
input_embeds_list.push_back(std::move(embeds));
213214
token_type_ids_list.push_back(std::move(tt_ids));
214215
} else {
@@ -244,11 +245,24 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
244245
m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template);
245246

246247
if (m_inputs_embedder->has_token_type_ids()) {
247-
auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, encoded_images, vlm_perf_metrics[i], true, norm_prompt.images_sequence);
248+
auto [embeds, tt_ids] =
249+
m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt,
250+
encoded_images,
251+
encoded_videos,
252+
vlm_perf_metrics[i],
253+
true,
254+
norm_prompt.images_sequence,
255+
norm_prompt.videos_sequence);
248256
input_embeds_list.push_back(std::move(embeds));
249257
token_type_ids_list.push_back(std::move(tt_ids));
250258
} else {
251-
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, vlm_perf_metrics[i], true, norm_prompt.images_sequence));
259+
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt,
260+
encoded_images,
261+
encoded_videos,
262+
vlm_perf_metrics[i],
263+
true,
264+
norm_prompt.images_sequence,
265+
norm_prompt.videos_sequence));
252266
}
253267

254268
auto end_get_inputs_embeds = std::chrono::steady_clock::now();

src/cpp/src/visual_language/inputs_embedder.cpp

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,11 +182,15 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(
182182
bool recalculate_merged_embeddings,
183183
const std::vector<size_t>& images_sequence,
184184
const std::vector<size_t>& videos_sequence) {
185-
OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images.");
185+
if (videos.size() > 0) {
186+
OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead.");
187+
} else {
188+
return get_inputs_embeds(prompt, images, metrics, recalculate_merged_embeddings, images_sequence);
189+
}
186190
}
187191

188192
std::vector<ov::genai::EncodedImage> InputsEmbedder::IInputsEmbedder::encode_video(const std::vector<ov::Tensor>& videos) {
189-
OPENVINO_THROW("Current model doesn't support videos preprocess currently. Input images are processed as separate images.");
193+
OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead.");
190194
}
191195

192196
NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt(
@@ -195,7 +199,13 @@ NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt(
195199
size_t video_base_id,
196200
const std::vector<EncodedImage>& images,
197201
const std::vector<std::vector<EncodedImage>>& videos) const {
198-
OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images.");
202+
if (videos.size() > 0) {
203+
OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead.");
204+
} else {
205+
NormlizedPrompt norm_prompt;
206+
std::tie(norm_prompt.unified_prompt, norm_prompt.images_sequence) = normalize_prompt(prompt, base_id, images);
207+
return norm_prompt;
208+
}
199209
}
200210

201211
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids(
@@ -207,6 +217,21 @@ std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::IInputsEmbedder::get_inputs_em
207217
OPENVINO_THROW("This model does not support token_type_ids.");
208218
}
209219

220+
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids(
221+
const std::string& prompt,
222+
const std::vector<EncodedImage>& images,
223+
const std::vector<std::vector<ov::genai::EncodedImage>>& videos,
224+
VLMPerfMetrics& metrics,
225+
bool recalculate_merged_embeddings,
226+
const std::vector<size_t>& image_sequence,
227+
const std::vector<size_t>& videos_sequence) {
228+
if (videos.size() > 0) {
229+
OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead.");
230+
} else {
231+
return get_inputs_embeds_with_token_type_ids(prompt, images, metrics, recalculate_merged_embeddings, image_sequence);
232+
}
233+
}
234+
210235
bool InputsEmbedder::IInputsEmbedder::has_token_type_ids() const { return false; }
211236

212237
/// Public InputsEmbedder class
@@ -303,6 +328,18 @@ std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::get_inputs_embeds_with_token_t
303328
prompt, images, metrics, recalculate_merged_embeddings, image_sequence);
304329
}
305330

331+
std::pair<ov::Tensor, ov::Tensor> InputsEmbedder::get_inputs_embeds_with_token_type_ids(
332+
const std::string& prompt,
333+
const std::vector<EncodedImage>& images,
334+
const std::vector<std::vector<ov::genai::EncodedImage>>& videos,
335+
VLMPerfMetrics& metrics,
336+
bool recalculate_merged_embeddings,
337+
const std::vector<size_t>& image_sequence,
338+
const std::vector<size_t>& videos_sequence) {
339+
return m_impl->get_inputs_embeds_with_token_type_ids(
340+
prompt, images, videos, metrics, recalculate_merged_embeddings, image_sequence, videos_sequence);
341+
}
342+
306343
bool InputsEmbedder::has_token_type_ids() const {
307344
return m_impl->has_token_type_ids();
308345
}

src/cpp/src/visual_language/inputs_embedder.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ class InputsEmbedder {
4747
// compute input embedding and token_type_ids
4848
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<EncodedImage>& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {});
4949

50+
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(
51+
const std::string& prompt,
52+
const std::vector<ov::genai::EncodedImage>& images,
53+
const std::vector<std::vector<ov::genai::EncodedImage>>& videos,
54+
ov::genai::VLMPerfMetrics& metrics,
55+
bool recalculate_merged_embeddings = true,
56+
const std::vector<size_t>& image_sequence = {},
57+
const std::vector<size_t>& videos_sequence = {});
58+
5059
bool has_token_type_ids() const;
5160

5261
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images);
@@ -128,6 +137,14 @@ class InputsEmbedder {
128137
const std::vector<size_t>& videos_sequence = {});
129138

130139
virtual std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {});
140+
virtual std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(
141+
const std::string& prompt,
142+
const std::vector<ov::genai::EncodedImage>& images,
143+
const std::vector<std::vector<ov::genai::EncodedImage>>& videos,
144+
ov::genai::VLMPerfMetrics& metrics,
145+
bool recalculate_merged_embeddings = true,
146+
const std::vector<size_t>& image_sequence = {},
147+
const std::vector<size_t>& videos_sequence = {});
131148

132149
virtual bool has_token_type_ids() const;
133150

0 commit comments

Comments
 (0)