Commit edd75d8
1: remove is_video
2: add ov::Property video

Signed-off-by: xipingya <[email protected]>

1 parent 6281335 commit edd75d8

File tree: 14 files changed, +105 -24 lines

src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
Lines changed: 6 additions & 1 deletion

@@ -165,7 +165,11 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
     /// @param request_id must be unique for every add_request() call.
     GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params);
     GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params);
-    GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector<ov::Tensor>& images, const ov::genai::GenerationConfig& sampling_params);
+    GenerationHandle add_request(uint64_t request_id,
+                                 const std::string& prompt,
+                                 const std::vector<ov::Tensor>& images,
+                                 const std::vector<ov::Tensor>& video,
+                                 const ov::genai::GenerationConfig& sampling_params);

     void step();

@@ -177,6 +181,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
     std::vector<VLMDecodedResults> generate(
         const std::vector<std::string>& prompts,
         const std::vector<std::vector<ov::Tensor>>& images,
+        const std::vector<std::vector<ov::Tensor>>& videos,
         const std::vector<GenerationConfig>& sampling_params,
         const StreamerVariant& streamer=std::monostate{});
     /**
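Usage note: a minimal sketch of driving the widened batched generate() with one image request and one video request in the same batch. The constructor arguments, ov::genai::greedy() defaults, and the load_video_frames() helper are illustrative assumptions, not part of this commit.

#include "openvino/genai/continuous_batching_pipeline.hpp"

// Hypothetical helper (not part of this commit): decode a clip into one ov::Tensor per frame.
std::vector<ov::Tensor> load_video_frames(const std::string& /*path*/) {
    return {};  // real code would decode frames here
}

int main() {
    ov::genai::ContinuousBatchingPipeline pipe("model_dir", ov::genai::SchedulerConfig{}, "CPU");

    std::vector<std::string> prompts = {"Describe the image.", "Describe the clip."};
    std::vector<std::vector<ov::Tensor>> images(2);   // images[0] would hold still-image tensors
    std::vector<std::vector<ov::Tensor>> videos(2);
    videos[1] = load_video_frames("clip.mp4");
    std::vector<ov::genai::GenerationConfig> configs(2, ov::genai::greedy());

    // New overload: per-prompt image lists and per-prompt video lists side by side.
    auto results = pipe.generate(prompts, images, videos, configs);
}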

src/cpp/include/openvino/genai/generation_config.hpp
Lines changed: 0 additions & 2 deletions

@@ -212,8 +212,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     // set to true if chat template should be applied for non-chat scenarios, set to false otherwise
     bool apply_chat_template = true;

-    // Vidoe or image
-    bool is_video = false;

     /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
      * Otherwise verifies eos_token_id == tokenizer_eos_token_id.
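With the flag removed, the media type no longer travels inside GenerationConfig; it is implied by which argument carries the tensors. A hedged migration sketch (pipe and frames are placeholders):

#include "openvino/genai/visual_language/pipeline.hpp"

ov::genai::VLMDecodedResults describe_clip(ov::genai::VLMPipeline& pipe,
                                           const std::vector<ov::Tensor>& frames) {
    ov::genai::GenerationConfig config = pipe.get_generation_config();
    // Before this commit: config.is_video = true; pipe.generate(prompt, frames, config, streamer);
    // After it: pass the frames through the dedicated video argument instead.
    return pipe.generate("Describe the clip.", /*rgbs=*/{}, /*video=*/frames,
                         config, std::monostate{});
}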

src/cpp/include/openvino/genai/visual_language/pipeline.hpp
Lines changed: 4 additions & 0 deletions

@@ -98,6 +98,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     VLMDecodedResults generate(
         const std::string& prompt,
         const std::vector<ov::Tensor>& rgbs,
+        const std::vector<ov::Tensor>& video,
         const GenerationConfig& generation_config,
         const StreamerVariant& streamer
     );

@@ -235,7 +236,10 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
 /*
  * utils that allow to use generate() in the following way:
  * pipe.generate(prompt, ov::genai::image(image_tensor)).
+ * pipe.generate(prompt, ov::genai::images(image_tensors)).
+ * pipe.generate(prompt, ov::genai::video(video_tensors)).
  */
 static constexpr ov::Property<ov::Tensor> image{"image"};
 static constexpr ov::Property<std::vector<ov::Tensor>> images{"images"};
+static constexpr ov::Property<std::vector<ov::Tensor>> video{"video"};
 }
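The new property mirrors image/images, so the documented one-liner call style extends to video. A minimal sketch, assuming the usual VLMPipeline constructor and leaving frame decoding out:

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    ov::genai::VLMPipeline pipe("model_dir", "CPU");
    std::vector<ov::Tensor> video_tensors;  // decoded frames go here

    // Same shape as the image/images call styles shown in the comment above.
    auto result = pipe.generate("What happens in this video?",
                                ov::genai::video(video_tensors));
}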

src/cpp/src/continuous_batching/pipeline.cpp
Lines changed: 8 additions & 3 deletions

@@ -237,8 +237,12 @@ GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, co
     return m_impl->add_request(request_id, input_ids, sampling_params);
 }

-GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector<ov::Tensor>& images, const ov::genai::GenerationConfig& sampling_params) {
-    return m_impl->add_request(request_id, prompt, images, sampling_params);
+GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id,
+                                                         const std::string& prompt,
+                                                         const std::vector<ov::Tensor>& images,
+                                                         const std::vector<ov::Tensor>& video,
+                                                         const ov::genai::GenerationConfig& sampling_params) {
+    return m_impl->add_request(request_id, prompt, images, video, sampling_params);
 }

 void ContinuousBatchingPipeline::step() {

@@ -272,9 +276,10 @@ std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::ve
 std::vector<VLMDecodedResults> ContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     const std::vector<std::vector<ov::Tensor>>& images,
+    const std::vector<std::vector<ov::Tensor>>& videos,
     const std::vector<GenerationConfig>& sampling_params,
     const StreamerVariant& streamer) {
-    return m_impl->generate(prompts, images, sampling_params, streamer);
+    return m_impl->generate(prompts, images, videos, sampling_params, streamer);
 }
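A hedged sketch of submitting image and video requests through the new add_request() overload; the step() driving loop follows the usual continuous-batching pattern, and the tensor arguments are placeholders:

#include "openvino/genai/continuous_batching_pipeline.hpp"

void submit_mixed_requests(ov::genai::ContinuousBatchingPipeline& pipe,
                           const std::vector<ov::Tensor>& image_tensors,
                           const std::vector<ov::Tensor>& video_frames) {
    auto config = ov::genai::greedy();

    // Image request: the video list stays empty.
    auto h0 = pipe.add_request(/*request_id=*/0, "Describe the image.",
                               image_tensors, /*video=*/{}, config);
    // Video request: the image list stays empty.
    auto h1 = pipe.add_request(/*request_id=*/1, "Describe the clip.",
                               /*images=*/{}, video_frames, config);

    while (pipe.has_non_finished_requests()) {
        pipe.step();
    }
}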

src/cpp/src/continuous_batching/pipeline_base.cpp
Lines changed: 31 additions & 5 deletions

@@ -51,7 +51,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     // TODO: remove this code and within model runner add check: if sequence group type is tokens,
     // but embedding model is available => compute embeddings first, then pass to LLM
     std::vector<std::vector<ov::Tensor>> images(prompts.size());
-    auto results_vlm = generate(prompts, images, sampling_params, streamer);
+    std::vector<std::vector<ov::Tensor>> videos(prompts.size());
+    auto results_vlm = generate(prompts, images, videos, sampling_params, streamer);
     std::vector<GenerationResult> resutls;
     for (auto& vlm_result : results_vlm) {
         GenerationResult result;

@@ -150,13 +151,15 @@ std::vector<VLMDecodedResults>
 ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     const std::vector<std::vector<ov::Tensor>>& rgbs_vector,
+    const std::vector<std::vector<ov::Tensor>>& video_vector,
     const std::vector<GenerationConfig>& sampling_params,
     const StreamerVariant& streamer) {
     auto generate_start_time = std::chrono::steady_clock::now();
     OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS);

     OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs.");
     OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors.");
+    OPENVINO_ASSERT(prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of video vectors.");

     std::vector<ov::Tensor> input_embeds_list;
     std::vector<VLMPerfMetrics> vlm_perf_metrics(prompts.size());

@@ -165,9 +168,14 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     if (m_is_chat_conversation) {
         OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
         const auto& rgbs = rgbs_vector[0];
+        const auto& video = video_vector[0];
         const auto& prompt = prompts[0];
         auto start_get_inputs_embeds = std::chrono::steady_clock::now();
-        encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[0].is_video);
+        if (rgbs.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(rgbs, false);
+        } else if (video.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(video, true);
+        }
         m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end());

         const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);

@@ -177,15 +185,26 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
         std::string templated_history = m_tokenizer.apply_chat_template(m_history, true);

         m_inputs_embedder->set_apply_chat_template_status(false);
-        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids));
+        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history,
+                                                                         m_history_images,
+                                                                         vlm_perf_metrics[0],
+                                                                         encoded_images.size() > 0,
+                                                                         m_history_image_ids));
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();
         vlm_perf_metrics[0].vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));

     } else {
         for (size_t i = 0; i < prompts.size(); i++) {
             const auto& prompt = prompts[i];
             const auto& rgbs = rgbs_vector[i];
-            const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[i].is_video);
+            const auto& video = video_vector[i];
+            std::vector<ov::genai::EncodedImage> encoded_images;
+            if (rgbs.size() > 0) {
+                encoded_images = m_inputs_embedder->encode_images(rgbs, false);
+            } else if (video.size() > 0) {
+                encoded_images = m_inputs_embedder->encode_images(video, true);
+            }
+
             auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);

             auto start_get_inputs_embeds = std::chrono::steady_clock::now();

@@ -241,14 +260,21 @@ GenerationHandle
 ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t request_id,
                                                                      const std::string& prompt,
                                                                      const std::vector<ov::Tensor>& rgbs,
+                                                                     const std::vector<ov::Tensor>& video,
                                                                      GenerationConfig sampling_params) {
     OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings.");
     ov::genai::VLMPerfMetrics metrics;
     ov::Tensor inputs;
     {
         std::lock_guard<std::mutex> lock(m_embeddings_mutex);
         m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);
+
+        std::vector<ov::genai::EncodedImage> encoded_images;
-        const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params.is_video);
+        if (rgbs.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(rgbs, false);
+        } else if (video.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(video, true);
+        }

         const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images);
         inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence);
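The same images-else-video dispatch now appears in three places above. A hedged sketch of how it could be factored out, assuming the m_inputs_embedder type is InputsEmbedder; note the precedence it encodes: when both lists are non-empty, the video frames are silently ignored.

// Hypothetical helper mirroring the repeated pattern above; not part of this commit.
std::vector<ov::genai::EncodedImage> encode_media(ov::genai::InputsEmbedder& embedder,
                                                  const std::vector<ov::Tensor>& rgbs,
                                                  const std::vector<ov::Tensor>& video) {
    std::vector<ov::genai::EncodedImage> encoded;
    if (rgbs.size() > 0) {
        encoded = embedder.encode_images(rgbs, /*is_video=*/false);
    } else if (video.size() > 0) {
        encoded = embedder.encode_images(video, /*is_video=*/true);
    }
    return encoded;  // stays empty for text-only requests
}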

src/cpp/src/continuous_batching/pipeline_base.hpp
Lines changed: 2 additions & 0 deletions

@@ -92,6 +92,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
     GenerationHandle add_request(uint64_t request_id,
                                  const std::string& prompt,
                                  const std::vector<ov::Tensor>& rgbs,
+                                 const std::vector<ov::Tensor>& video,
                                  GenerationConfig sampling_params);

     /**

@@ -124,6 +125,7 @@
     generate(
         const std::vector<std::string>& prompts,
         const std::vector<std::vector<ov::Tensor>>& rgbs,
+        const std::vector<std::vector<ov::Tensor>>& videos,
         const std::vector<GenerationConfig>& sampling_params,
         const StreamerVariant& streamer);

src/cpp/src/continuous_batching/pipeline_impl.cpp
Lines changed: 1 addition & 1 deletion

@@ -238,7 +238,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
         timer.end();
         return add_request(request_id, inputs, sampling_params);
     } else if (m_model_input_type == ModelInputType::EMBEDDINGS) {
-        return ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(request_id, prompt, {}, sampling_params);
+        return ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(request_id, prompt, {}, {}, sampling_params);
     } else {
         OPENVINO_THROW("Unknown model input type.");
     }

src/cpp/src/visual_language/continuous_batching_adapter.hpp
Lines changed: 2 additions & 1 deletion

@@ -44,11 +44,12 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V
     VLMDecodedResults generate(
         const std::string& prompt,
         const std::vector<ov::Tensor>& rgbs,
+        const std::vector<ov::Tensor>& video,
         GenerationConfig generation_config,
         const StreamerVariant& streamer
     ) override {
         auto start_time = std::chrono::steady_clock::now();
-        auto result = m_impl.generate({prompt}, {rgbs}, {generation_config}, streamer)[0];
+        auto result = m_impl.generate({prompt}, {rgbs}, {video}, {generation_config}, streamer)[0];
         auto stop_time = std::chrono::steady_clock::now();

         VLMDecodedResults decoded;

src/cpp/src/visual_language/pipeline.cpp
Lines changed: 10 additions & 3 deletions

@@ -153,6 +153,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     VLMDecodedResults generate(
         const std::string& prompt,
         const std::vector<ov::Tensor>& rgbs,
+        const std::vector<ov::Tensor>& video,
         GenerationConfig generation_config,
         const StreamerVariant& streamer
     ) override {

@@ -183,7 +184,12 @@
                 "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!");
         }

-        const auto encoded_images = m_inputs_embedder->encode_images(rgbs, generation_config.is_video);
+        std::vector<ov::genai::EncodedImage> encoded_images;
+        if (rgbs.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(rgbs, false);
+        } else if (video.size() > 0) {
+            encoded_images = m_inputs_embedder->encode_images(video, true);
+        }
         auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);

         if (m_is_chat_conversation) {

@@ -437,10 +443,11 @@ VLMPipeline::~VLMPipeline() = default;
 VLMDecodedResults VLMPipeline::generate(
     const std::string& prompt,
     const std::vector<ov::Tensor>& rgbs,
+    const std::vector<ov::Tensor>& video,
     const GenerationConfig& generation_config,
     const StreamerVariant& streamer
 ) {
-    return m_pimpl->generate(prompt, rgbs, generation_config, streamer);
+    return m_pimpl->generate(prompt, rgbs, video, generation_config, streamer);
 }

 VLMDecodedResults VLMPipeline::generate(

@@ -449,7 +456,7 @@ VLMDecodedResults VLMPipeline::generate(
     const GenerationConfig& generation_config,
     const StreamerVariant& streamer
 ) {
-    return m_pimpl->generate(prompt, {rgb}, generation_config, streamer);
+    return m_pimpl->generate(prompt, {rgb}, {}, generation_config, streamer);
 }

 VLMDecodedResults VLMPipeline::generate(
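As the last hunk shows, the single-image overload now forwards an empty video list, so existing image-only callers keep working unchanged; a minimal sketch:

// Unchanged call site: one image tensor, no video. Internally this now routes
// to m_pimpl->generate(prompt, {rgb}, {}, generation_config, streamer).
ov::genai::VLMDecodedResults caption(ov::genai::VLMPipeline& pipe, const ov::Tensor& rgb) {
    return pipe.generate("Caption this image.", rgb,
                         pipe.get_generation_config(), std::monostate{});
}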

src/cpp/src/visual_language/pipeline_base.hpp
Lines changed: 16 additions & 0 deletions

@@ -23,6 +23,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase {
     virtual VLMDecodedResults generate(
         const std::string& prompt,
         const std::vector<ov::Tensor>& rgbs,
+        const std::vector<ov::Tensor>& video,
         GenerationConfig generation_config,
         const StreamerVariant& streamer
     ) = 0;

@@ -33,6 +34,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase {
     ) {
         auto image = config_map.find(ov::genai::image.name());
         auto images = config_map.find(ov::genai::images.name());
+        auto video = config_map.find(ov::genai::video.name());
         OPENVINO_ASSERT(
             config_map.end() == image || config_map.end() == images,
             "Only one property can be set: image of images."

@@ -52,13 +54,27 @@ class ov::genai::VLMPipeline::VLMPipelineBase {
             }
         }

+        std::vector<ov::Tensor> video_rgbs;
+        if (config_map.end() != video) {
+            if (video->second.is<std::vector<ov::Tensor>>()) {
+                video_rgbs = video->second.as<std::vector<ov::Tensor>>();
+            }
+            else if (video->second.is<ov::Tensor>()) {
+                video_rgbs = {video->second.as<ov::Tensor>()};
+            }
+            else {
+                OPENVINO_THROW("Unknown video type.");
+            }
+        }
+
         ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map);
         GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
         config.update_generation_config(config_map);

         return generate(
             prompt,
             rgbs,
+            video_rgbs,
             config,
             utils::get_streamer_from_map(config_map)
         );
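The parsing above accepts the video entry either as a std::vector<ov::Tensor> or as a single ov::Tensor. A hedged sketch of both spellings reaching the same video_rgbs list; the single-tensor form via a raw AnyMap entry is an assumption about how callers would exercise that branch:

void run_both_spellings(ov::genai::VLMPipeline& pipe,
                        const ov::Tensor& one_clip,
                        const std::vector<ov::Tensor>& frames) {
    // Vector-of-frames form via the typed property...
    auto r1 = pipe.generate("Describe the clip.", ov::genai::video(frames));
    // ...and a single tensor stored under the same key in a plain AnyMap;
    // the is<ov::Tensor>() branch above wraps it into a one-element vector.
    auto r2 = pipe.generate("Describe the clip.",
                            ov::AnyMap{{ov::genai::video.name(), one_clip}});
}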
