Changes from all commits

Commits
58 commits
6c49dc8
Avoid to do resize for same width and height images.
xipingyan Jul 30, 2025
c7d9932
Enable video process for qwen*-vl
xipingyan Jul 30, 2025
2ee043f
Add python interface: generate config: is_video, default false.
xipingyan Jul 31, 2025
29c74fd
fallback video_encode to image encode in base class.
xipingyan Aug 5, 2025
78dac29
Update calc target image size.
xipingyan Aug 5, 2025
7b2c115
Reduce shared codes, fallback to image process via return empty vector;
xipingyan Aug 5, 2025
10d8e8d
1: remove is_video,
xipingyan Aug 9, 2025
a3000d4
Update src/cpp/src/visual_language/llava/classes.cpp
xipingyan Sep 11, 2025
062fc40
Merge branch 'master' into xp/enable_qwen_vl_video_preprocess
xipingyan Sep 12, 2025
4d8375d
Update src/cpp/src/visual_language/pipeline.cpp
xipingyan Sep 12, 2025
ef9f868
rename according to copilot suggestion
xipingyan Sep 12, 2025
ad95828
Merge branch 'xp/enable_qwen_vl_video_preprocess' of https://github.c…
xipingyan Sep 12, 2025
f92b19b
rename rgbs to images
xipingyan Sep 12, 2025
66cdf38
enable if node to unify image and video preprocess.
xipingyan Sep 15, 2025
3eda036
cpp preprocess: enable video preprecess.
xipingyan Sep 15, 2025
3df267f
Pass same_images
xipingyan Sep 15, 2025
bf3169b
add commments for same image
xipingyan Sep 15, 2025
e1250aa
Update loop condition, and rename variables.
xipingyan Sep 16, 2025
fe0ab92
Update src/cpp/src/visual_language/pipeline_base.hpp
xipingyan Sep 16, 2025
dec67b2
video should be frames.
xipingyan Sep 16, 2025
caee3fd
Add pytest for video input.
xipingyan Sep 16, 2025
6a49a48
Merge branch 'master' into xp/enable_qwen_vl_video_preprocess
xipingyan Sep 16, 2025
800638e
Merge branch 'master' into xp/enable_qwen_vl_video_preprocess
peterchen-intel Sep 17, 2025
1502b28
Remove is_video python attribute.
xipingyan Sep 17, 2025
4d8e867
rename video to videos
xipingyan Sep 17, 2025
ea7fc94
Update docs, and add video for add_request.
xipingyan Sep 17, 2025
60364bf
Fix docs format.
xipingyan Sep 17, 2025
4ea5b3d
Fix test error: can't catch exception.
xipingyan Sep 18, 2025
8a0ab2e
Fix: cannot be narrowed from type 'int' to 'float' in initializer list
xipingyan Sep 18, 2025
28337ea
Support no image or video input;
xipingyan Sep 18, 2025
f3fd7d4
Add checking input for python api.
xipingyan Sep 18, 2025
a80d28e
cpp interface: generate, remove video. add is_video, default false
xipingyan Sep 18, 2025
6ab0a35
update get_inputs_embeds_with_token_type_ids and get_inputs_embeds, i…
xipingyan Sep 18, 2025
c531982
Merge branch 'master' into xp/enable_qwen_vl_video_preprocess
xipingyan Sep 18, 2025
dc30ec1
update pyi interface of generate.
xipingyan Sep 19, 2025
5edf0a5
Remove "const bool& is_video" in add_request and generate.
xipingyan Sep 24, 2025
2215f8a
Update src/cpp/src/visual_language/qwen2vl/classes.cpp
xipingyan Sep 25, 2025
14352a7
Update src/python/openvino_genai/py_openvino_genai.pyi
xipingyan Sep 25, 2025
89afa54
copilot give a wrong suggestion. add images and video param for add_r…
xipingyan Sep 25, 2025
3b5c6cd
Merge remote-tracking branch 'origin/master' into xp/enable_qwen_vl_v…
xipingyan Sep 25, 2025
8768795
Add examples to .md
xipingyan Sep 25, 2025
be57bf2
Fix test video error, and input multiple images.
xipingyan Sep 25, 2025
d96c5dd
Update test based on 4D video.
xipingyan Sep 26, 2025
aaf20b0
Add vlm test dependency: opencv-python
xipingyan Sep 27, 2025
a2ad61b
Merge remote-tracking branch 'origin/master' into xp/enable_qwen_vl_v…
xipingyan Sep 27, 2025
6f5189b
Enable mix video and image input.
xipingyan Sep 27, 2025
c0829a3
split encode_images into encode_images and encode_video
xipingyan Sep 28, 2025
f25770b
Remove:
xipingyan Sep 28, 2025
72c621b
1: Add <video_pad> placeholder,
xipingyan Sep 28, 2025
132b228
Update position_ids after enable video.
xipingyan Sep 29, 2025
8c0e13d
add video histry id.
xipingyan Sep 30, 2025
64ba684
Update src/cpp/include/openvino/genai/visual_language/pipeline.hpp
xipingyan Sep 30, 2025
bbbef65
Merge branch 'xp/enable_qwen_vl_video_preprocess' of https://github.c…
xipingyan Sep 30, 2025
6e33dcf
Rename video to videos, reducing confusion.
xipingyan Sep 30, 2025
6bf63de
Remove useless header.
xipingyan Sep 30, 2025
eb4faea
Update video-> videos in Readme
xipingyan Sep 30, 2025
123221b
all video -> videos
xipingyan Sep 30, 2025
515c911
Call images when the models not implement video process.
xipingyan Sep 30, 2025
13 changes: 13 additions & 0 deletions README.md
@@ -160,6 +160,13 @@ image_data = ov.Tensor(image_data)

prompt = "Can you describe the image?"
result = pipe.generate(prompt, image=image_data, max_new_tokens=100)

# To input multiple images, use 'images='
# result = pipe.generate(prompt, images=[image_data], max_new_tokens=100)

# To input video frames, use 'videos='
# result = pipe.generate(prompt, videos=[frames_data], max_new_tokens=100)

print(result.texts[0])
```

Expand All @@ -181,6 +188,12 @@ int main(int argc, char* argv[]) {
ov::genai::image(rgb),
ov::genai::max_new_tokens(100)
) << '\n';

// To input multiple images, use 'images'
// pipe.generate(prompt, ov::genai::images(std::vector<ov::Tensor>{rgb}), ov::genai::max_new_tokens(100));

// To input video frames, use 'videos'
// pipe.generate(prompt, ov::genai::videos(std::vector<ov::Tensor>{frames}), ov::genai::max_new_tokens(100));
}
```
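The snippets above elide how `frames` is built. Below is a minimal, self-contained C++ sketch of one way to assemble the video input, assuming (as the tests in this PR suggest) that each video is a single 4D uint8 tensor with [num_frames, height, width, channels] layout; the model path, shapes, and blank frame data are placeholders, not values taken from the PR.

```cpp
#include "openvino/genai/visual_language/pipeline.hpp"

#include <cstring>
#include <iostream>
#include <vector>

int main() {
    ov::genai::VLMPipeline pipe("./Qwen2-VL-2B-Instruct", "CPU");  // placeholder model path

    // Assumption: one uint8 tensor per video, frames stacked along the first dimension.
    const size_t num_frames = 8, height = 336, width = 336, channels = 3;
    ov::Tensor frames(ov::element::u8, {num_frames, height, width, channels});

    // Placeholder frame data; in practice, copy decoded RGB frames (e.g. from OpenCV) here.
    std::memset(frames.data<uint8_t>(), 0, frames.get_byte_size());

    std::cout << pipe.generate(
        "Can you describe the video?",
        ov::genai::videos(std::vector<ov::Tensor>{frames}),
        ov::genai::max_new_tokens(100)
    ) << '\n';
}
```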

19 changes: 18 additions & 1 deletion src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -165,7 +165,16 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
/// @param request_id must be unique for every add_request() call.
GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params);
GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params);
GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector<ov::Tensor>& images, const ov::genai::GenerationConfig& sampling_params);
GenerationHandle add_request(uint64_t request_id,
const std::string& prompt,
const std::vector<ov::Tensor>& images,
const ov::genai::GenerationConfig& sampling_params);

GenerationHandle add_request(uint64_t request_id,
const std::string& prompt,
const std::vector<ov::Tensor>& images,
const std::vector<ov::Tensor>& videos,
const ov::genai::GenerationConfig& sampling_params);

void step();

@@ -179,6 +188,14 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
const std::vector<std::vector<ov::Tensor>>& images,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer=std::monostate{});

std::vector<VLMDecodedResults> generate(
const std::vector<std::string>& prompts,
const std::vector<std::vector<ov::Tensor>>& images,
const std::vector<std::vector<ov::Tensor>>& videos,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer=std::monostate{});

/**
* @brief start chat with keeping history in kv cache.
* @param system_message optional system message.
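For reference, a hedged sketch of how the batched `generate()` overload declared above might be called: one entry per prompt in both `images` and `videos`, with an empty vector wherever a given input kind is not used. The pipeline construction is omitted and the prompt, shapes, and helper name are illustrative only.

```cpp
#include "openvino/genai/continuous_batching_pipeline.hpp"

#include <string>
#include <vector>

// Assumes an already-constructed pipeline and a video packed as a
// [num_frames, H, W, C] uint8 tensor.
std::vector<ov::genai::VLMDecodedResults> describe_clip(
        ov::genai::ContinuousBatchingPipeline& pipe,
        const ov::Tensor& video_frames) {
    std::vector<std::string> prompts = {"What is happening in this clip?"};
    std::vector<std::vector<ov::Tensor>> images = {{}};              // no still images for prompt 0
    std::vector<std::vector<ov::Tensor>> videos = {{video_frames}};  // one video for prompt 0
    std::vector<ov::genai::GenerationConfig> configs(1);
    configs[0].max_new_tokens = 100;
    return pipe.generate(prompts, images, videos, configs);
}
```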
24 changes: 23 additions & 1 deletion src/cpp/include/openvino/genai/visual_language/pipeline.hpp
@@ -97,7 +97,26 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
VLMDecodedResults generate(
const std::string& prompt,
const std::vector<ov::Tensor>& rgbs,
const std::vector<ov::Tensor>& images,
const GenerationConfig& generation_config,
const StreamerVariant& streamer
);

/// @brief Generate a response given a prompt and any number of
/// uint8 RGB images with [NHWC] or [HWC] layout,
/// or uint8 RGB videos with [NHWC] layout, where the first dimension is the number of frames.
/// @param prompt A prompt to respond to.
/// @param images Images to be prepended to a prompt.
/// @param videos Multiple videos, each providing multiple frames, to be prepended to a prompt.
/// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result.
/// @return A string generated by a model.
/// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
/// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false.
VLMDecodedResults generate(
const std::string& prompt,
const std::vector<ov::Tensor>& images,
const std::vector<ov::Tensor>& videos,
const GenerationConfig& generation_config,
const StreamerVariant& streamer
);
@@ -244,7 +263,10 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/*
* utils that allow to use generate() in the following way:
* pipe.generate(prompt, ov::genai::image(image_tensor)).
* pipe.generate(prompt, ov::genai::images(image_tensors)).
* pipe.generate(prompt, ov::genai::videos(videos_tensors)).
*/
static constexpr ov::Property<ov::Tensor> image{"image"};
static constexpr ov::Property<std::vector<ov::Tensor>> images{"images"};
static constexpr ov::Property<std::vector<ov::Tensor>> videos{"videos"};
}
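As a usage sketch of the properties above: assuming a `VLMPipeline pipe` constructed as in the README example and purely illustrative tensor shapes, a still image and a video can be combined in a single call (mixed image and video input is enabled in this PR's later commits).

```cpp
// Illustrative shapes only; real tensors would hold decoded RGB data.
ov::Tensor photo(ov::element::u8, {1, 336, 336, 3});   // [NHWC] still image
ov::Tensor clip(ov::element::u8, {8, 336, 336, 3});    // [frames, H, W, C] video

auto result = pipe.generate(
    "Compare the photo with the video.",
    ov::genai::images(std::vector<ov::Tensor>{photo}),
    ov::genai::videos(std::vector<ov::Tensor>{clip}),
    ov::genai::max_new_tokens(100));
```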
21 changes: 20 additions & 1 deletion src/cpp/src/continuous_batching/pipeline.cpp
@@ -239,10 +239,21 @@ GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, co
return m_impl->add_request(request_id, input_ids, sampling_params);
}

GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector<ov::Tensor>& images, const ov::genai::GenerationConfig& sampling_params) {
GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id,
const std::string& prompt,
const std::vector<ov::Tensor>& images,
const ov::genai::GenerationConfig& sampling_params) {
return m_impl->add_request(request_id, prompt, images, sampling_params);
}

GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id,
const std::string& prompt,
const std::vector<ov::Tensor>& images,
const std::vector<ov::Tensor>& videos,
const ov::genai::GenerationConfig& sampling_params) {
return m_impl->add_request(request_id, prompt, images, videos, sampling_params);
}

void ContinuousBatchingPipeline::step() {
m_impl->step();
}
@@ -279,6 +290,14 @@ std::vector<VLMDecodedResults> ContinuousBatchingPipeline::generate(
return m_impl->generate(prompts, images, sampling_params, streamer);
}

std::vector<VLMDecodedResults> ContinuousBatchingPipeline::generate(
const std::vector<std::string>& prompts,
const std::vector<std::vector<ov::Tensor>>& images,
const std::vector<std::vector<ov::Tensor>>& videos,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
return m_impl->generate(prompts, images, videos, sampling_params, streamer);
}

void ContinuousBatchingPipeline::start_chat(const std::string& system_message) {
m_impl->finish_chat();
116 changes: 102 additions & 14 deletions src/cpp/src/continuous_batching/pipeline_base.cpp
@@ -35,11 +35,14 @@ void ContinuousBatchingPipeline::IContinuousBatchingPipeline::finish_chat() {
m_is_chat_conversation = false;
m_history.clear();
m_history_images.clear();
m_history_videos.clear();
m_history_image_ids.clear();
m_history_video_ids.clear();
if (m_inputs_embedder) {
m_inputs_embedder->finish_chat();
}
m_image_id = 0;
m_video_id = 0;
};

std::vector<GenerationResult>
@@ -151,40 +154,72 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
const std::vector<std::string>& prompts,
const std::vector<std::vector<ov::Tensor>>& rgbs_vector,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
return generate(prompts, rgbs_vector, {}, sampling_params, streamer);
}

std::vector<VLMDecodedResults>
ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
const std::vector<std::string>& prompts,
const std::vector<std::vector<ov::Tensor>>& rgbs_vector,
const std::vector<std::vector<ov::Tensor>>& video_vector,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
auto generate_start_time = std::chrono::steady_clock::now();
OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS);

OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs.");
OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors.");

std::vector<ov::Tensor> input_embeds_list;
std::vector<ov::Tensor> token_type_ids_list;

std::vector<VLMPerfMetrics> vlm_perf_metrics(prompts.size());
std::vector<EncodedImage> encoded_images = {};
std::vector<std::vector<EncodedImage>> encoded_videos = {};

if (m_is_chat_conversation) {
OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
const auto& rgbs = rgbs_vector[0];
const auto& prompt = prompts[0];
auto start_get_inputs_embeds = std::chrono::steady_clock::now();
encoded_images = m_inputs_embedder->encode_images(rgbs);

auto video_rgbs = video_vector.size() > 0 ? video_vector[0] : std::vector<ov::Tensor>{};
for (auto& vd : video_rgbs) {
auto encoded_vd = m_inputs_embedder->encode_video({vd});
m_history_videos.push_back(encoded_vd);
encoded_videos.push_back(encoded_vd);
}

auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[0] : std::vector<ov::Tensor>{};
encoded_images = m_inputs_embedder->encode_images(image_rgbs);
m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end());

const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);
m_history.push_back({{"role", "user"}, {"content", unified_prompt}});
m_history_image_ids.insert(m_history_image_ids.end(), image_sequence.begin(), image_sequence.end());
auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, m_video_id, encoded_images, encoded_videos);
m_history.push_back({{"role", "user"}, {"content", norm_prompt.unified_prompt}});
m_history_image_ids.insert(m_history_image_ids.end(), norm_prompt.images_sequence.begin(), norm_prompt.images_sequence.end());
m_history_video_ids.insert(m_history_video_ids.end(), norm_prompt.videos_sequence.begin(), norm_prompt.videos_sequence.end());

std::string templated_history = m_tokenizer.apply_chat_template(m_history, true);

m_inputs_embedder->set_apply_chat_template_status(false);

if (m_inputs_embedder->has_token_type_ids()) {
auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids);
auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history,
m_history_images,
m_history_videos,
vlm_perf_metrics[0],
true,
m_history_image_ids,
m_history_video_ids);
input_embeds_list.push_back(std::move(embeds));
token_type_ids_list.push_back(std::move(tt_ids));
} else {
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids));
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(templated_history,
m_history_images,
m_history_videos,
vlm_perf_metrics[0],
true,
m_history_image_ids,
m_history_video_ids));
}

auto end_get_inputs_embeds = std::chrono::steady_clock::now();
@@ -193,19 +228,41 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
} else {
for (size_t i = 0; i < prompts.size(); i++) {
const auto& prompt = prompts[i];
const auto& rgbs = rgbs_vector[i];

auto start_get_inputs_embeds = std::chrono::steady_clock::now();
const auto encoded_images = m_inputs_embedder->encode_images(rgbs);
auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);

auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[i] : std::vector<ov::Tensor>{};
auto video_rgbs = video_vector.size() > 0 ? video_vector[i] : std::vector<ov::Tensor>{};
const auto encoded_images = m_inputs_embedder->encode_images(image_rgbs);
std::vector<std::vector<ov::genai::EncodedImage>> encoded_videos;
for (auto& vd : video_rgbs) {
auto encoded_vd = m_inputs_embedder->encode_video({vd});
encoded_videos.push_back(encoded_vd);
}

auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, m_video_id, encoded_images, encoded_videos);

m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template);

if (m_inputs_embedder->has_token_type_ids()) {
auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(unified_prompt, encoded_images, vlm_perf_metrics[i], true, image_sequence);
auto [embeds, tt_ids] =
m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt,
encoded_images,
encoded_videos,
vlm_perf_metrics[i],
true,
norm_prompt.images_sequence,
norm_prompt.videos_sequence);
input_embeds_list.push_back(std::move(embeds));
token_type_ids_list.push_back(std::move(tt_ids));
} else {
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, vlm_perf_metrics[i], true, image_sequence));
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt,
encoded_images,
encoded_videos,
vlm_perf_metrics[i],
true,
norm_prompt.images_sequence,
norm_prompt.videos_sequence));
}

auto end_get_inputs_embeds = std::chrono::steady_clock::now();
@@ -240,13 +297,16 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
m_inputs_embedder->update_chat_history(results[0].texts[0], encoded_results[0].m_status);
if (encoded_results[0].m_status != ov::genai::GenerationStatus::CANCEL) {
m_image_id += encoded_images.size();
m_video_id += encoded_videos.size();
m_history.push_back({{"role", "assistant"}, {"content", results[0].texts[0]}});
}
else {
m_history.pop_back();
for (size_t idx = 0; idx < encoded_images.size(); idx++) {
m_history_image_ids.pop_back();
m_history_images.pop_back();
}
for (size_t idx = 0; idx < encoded_videos.size(); idx++) {
m_history_video_ids.pop_back();
m_history_videos.pop_back();
}
}
}
@@ -264,14 +324,42 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re
{
std::lock_guard<std::mutex> lock(m_embeddings_mutex);
m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);
const auto encoded_images = m_inputs_embedder->encode_images(rgbs);

auto encoded_images = m_inputs_embedder->encode_images(rgbs);

const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images);
inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence);
}
return add_request(request_id, inputs, sampling_params);
}

GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(
uint64_t request_id,
const std::string& prompt,
const std::vector<ov::Tensor>& images,
const std::vector<ov::Tensor>& videos,
GenerationConfig sampling_params) {
OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings.");

ov::genai::VLMPerfMetrics metrics;
ov::Tensor inputs;
{
std::lock_guard<std::mutex> lock(m_embeddings_mutex);
m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template);

auto encoded_images = m_inputs_embedder->encode_images(images);
std::vector<std::vector<ov::genai::EncodedImage>> encoded_videos;
for (auto& vd : videos) {
auto encoded_vd = m_inputs_embedder->encode_video({vd});
encoded_videos.push_back(encoded_vd);
}

auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, 0, 0, encoded_images, encoded_videos);
inputs = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, metrics, true, norm_prompt.images_sequence);
}
return add_request(request_id, inputs, sampling_params);
}

void ContinuousBatchingPipeline::IContinuousBatchingPipeline::stream_tokens(
const std::shared_ptr<ThreadedStreamerWrapper>& streamer_ptr,
const GenerationHandle& handle