@@ -51,7 +51,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
    // TODO: remove this code and within model runner add check: if sequence group type is tokens,
    // but embedding model is available => compute embeddings first, then pass to LLM
    std::vector<std::vector<ov::Tensor>> images(prompts.size());
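+    // The empty per-prompt image and video lists route text-only requests through
+    // the VLM overload below without any media to encode.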
-    auto results_vlm = generate(prompts, images, sampling_params, streamer);
+    std::vector<std::vector<ov::Tensor>> videos(prompts.size());
+    auto results_vlm = generate(prompts, images, videos, sampling_params, streamer);
    std::vector<GenerationResult> resutls;
    for (auto& vlm_result : results_vlm) {
        GenerationResult result;
@@ -150,13 +151,15 @@ std::vector<VLMDecodedResults>
ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
    const std::vector<std::string>& prompts,
    const std::vector<std::vector<ov::Tensor>>& rgbs_vector,
+    const std::vector<std::vector<ov::Tensor>>& video_vector,
    const std::vector<GenerationConfig>& sampling_params,
    const StreamerVariant& streamer) {
    auto generate_start_time = std::chrono::steady_clock::now();
    OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS);

    OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs.");
    OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors.");
+    OPENVINO_ASSERT(prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of video vectors.");
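+    // A hypothetical call-site sketch (identifiers assumed, not part of this patch):
+    //   std::vector<std::vector<ov::Tensor>> images{{image_tensor}}, videos{{}};
+    //   pipe.generate(prompts, images, videos, sampling_params, streamer);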

    std::vector<ov::Tensor> input_embeds_list;
    std::vector<VLMPerfMetrics> vlm_perf_metrics(prompts.size());
@@ -165,9 +168,14 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
    if (m_is_chat_conversation) {
        OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
        const auto& rgbs = rgbs_vector[0];
+        const auto& video = video_vector[0];
        const auto& prompt = prompts[0];
        auto start_get_inputs_embeds = std::chrono::steady_clock::now();
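+        // Video frames reuse the image encoder; the second encode_images() argument
+        // selects the video preprocessing path. If both images and video are given,
+        // images take precedence and the video is ignored for this turn.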
-        encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[0].is_video);
+        if (rgbs.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(rgbs, false);
+        } else if (video.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(video, true);
+        }
        m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end());

        const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);
@@ -177,15 +185,26 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
        std::string templated_history = m_tokenizer.apply_chat_template(m_history, true);

        m_inputs_embedder->set_apply_chat_template_status(false);
-        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids));
+        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history,
+                                                                         m_history_images,
+                                                                         vlm_perf_metrics[0],
+                                                                         encoded_images.size() > 0,
+                                                                         m_history_image_ids));
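+        // The media flag passed to get_inputs_embeds() is now derived from
+        // encoded_images rather than rgbs alone, so video-only turns set it too.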
        auto end_get_inputs_embeds = std::chrono::steady_clock::now();
        vlm_perf_metrics[0].vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));

    } else {
        for (size_t i = 0; i < prompts.size(); i++) {
            const auto& prompt = prompts[i];
            const auto& rgbs = rgbs_vector[i];
-            const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[i].is_video);
+            const auto& video = video_vector[i];
+            std::vector<ov::genai::EncodedImage> encoded_images;
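+            // Same precedence rule as the chat branch: encode images if present,
+            // otherwise fall back to the video frames.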
+            if (rgbs.size() > 0) {
+                encoded_images = m_inputs_embedder->encode_images(rgbs, false);
+            } else if (video.size() > 0) {
+                encoded_images = m_inputs_embedder->encode_images(video, true);
+            }
+
            auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);

            auto start_get_inputs_embeds = std::chrono::steady_clock::now();
@@ -241,14 +260,21 @@ GenerationHandle
ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t request_id,
                                                                     const std::string& prompt,
                                                                     const std::vector<ov::Tensor>& rgbs,
+                                                                     const std::vector<ov::Tensor>& video,
                                                                     GenerationConfig sampling_params) {
    OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings.");
    ov::genai::VLMPerfMetrics metrics;
    ov::Tensor inputs;
    {
        std::lock_guard<std::mutex> lock(m_embeddings_mutex);
        m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);
-        const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params.is_video);
+
+        std::vector<ov::genai::EncodedImage> encoded_images;
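+        // As in generate(): rgbs wins over video, and an empty pair leaves
+        // encoded_images empty so the request proceeds as text-only.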
+        if (rgbs.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(rgbs, false);
+        } else if (video.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(video, true);
+        }

        const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images);
        inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence);