18 changes: 18 additions & 0 deletions tests/python_tests/samples/test_tools_llm_benchmark.py
@@ -234,3 +234,21 @@ def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_a
"-m", convert_model,
] + sample_args
run_sample(benchmark_py_command)


@pytest.mark.samples
@pytest.mark.parametrize("convert_model", ["ms-marco-TinyBERT-L2-v2"], indirect=True)
@pytest.mark.parametrize("sample_args", [
["-d", "cpu", "-n", "2", "--rerank"],
["-d", "cpu", "-n", "2", "--reranking_max_length", "10", "--reranking_top_n", "1", "--rerank"],
["-d", "cpu", "-n", "2", "--optimum", "--rerank"],
["-d", "cpu", "-n", "1", "--reranking_max_length", "10", "--reranking_top_n", "1", "--optimum", "--rerank"]
])
def test_python_tool_llm_benchmark_text_reranking(self, convert_model, sample_args):
benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py')
benchmark_py_command = [
sys.executable,
benchmark_script,
"-m", convert_model,
] + sample_args
run_sample(benchmark_py_command)
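For local verification, the new parametrized test can be selected on its own. The invocation below is an illustrative sketch; any pytest options beyond the file path and the markers shown in the diff are assumptions:

import subprocess
import sys

# Run only the new reranking sample test; "-m samples" matches the pytest.mark.samples marker above.
subprocess.run(
    [
        sys.executable, "-m", "pytest",
        "tests/python_tests/samples/test_tools_llm_benchmark.py",
        "-k", "text_reranking",
        "-m", "samples",
    ],
    check=True,
)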
14 changes: 13 additions & 1 deletion tools/llm_bench/benchmark.py
@@ -19,6 +19,7 @@
import task.speech_to_text_generation as bench_speech
import task.text_embeddings as bench_text_embed
import task.text_to_speech_generation as bench_text_to_speech
import task.text_reranker as bench_text_rerank

DEFAULT_TORCH_THREAD_NUMS = 16

@@ -196,6 +197,16 @@ def get_argprser():
parser.add_argument("--embedding_normalize", action="store_true", help="Normalize embeddings. Applicable only for text embeddings")
parser.add_argument("--embedding_max_length", type=int, default=None,
help="Max length for text embeddings. Input text will be padded or truncated to specified value")
parser.add_argument("--reranking_max_length", type=int, default=None,
help="Max length for text reranking. Input text will be padded or truncated to specified value")
parser.add_argument("--reranking_top_n", type=int, default=None,
help="Number of top results to return for text reranking")
parser.add_argument("--texts", nargs='+', default=None,
help="List of candidates for reranking based on their relevance to a prompt(query). Applicable for Text Rerank pipeline.")
parser.add_argument('--texts_file', nargs='+', default=None,
help='Texts file(s) in jsonl format with candidates for reranking based on relevance to a prompt (query). '
'Multiple files should be separated with space(s). Applicable for Text Rerank pipeline.')
parser.add_argument('--rerank', action='store_true', help='Benchmark reranking pipeline.')
parser.add_argument("--apply_chat_template", action="store_true",
help="Apply chat template for LLM. By default chat template is not applied. It's better to use with --disable_prompt_permutation,"
" otherwise the prompt will be modified after applying the chat template, so the structure of chat template will not be kept.")
@@ -214,7 +225,8 @@ def get_argprser():
'speech2text': bench_speech.run_speech_2_txt_benchmark,
"vlm": bench_vlm.run_visual_language_generation_benchmark,
"text_embed": bench_text_embed.run_text_embddings_benchmark,
"text2speech": bench_text_to_speech.run_text_2_speech_benchmark
"text2speech": bench_text_to_speech.run_text_2_speech_benchmark,
"text_rerank": bench_text_rerank.run_text_reranker_benchmark
}


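The flags above combine into a typical end-to-end reranking run. The sketch below is illustrative: the model path is a placeholder for a locally exported ms-marco-TinyBERT-L2-v2, and the max-length value is arbitrary.

import subprocess
import sys

cmd = [
    sys.executable, "tools/llm_bench/benchmark.py",
    "-m", "models/ms-marco-TinyBERT-L2-v2",  # assumed path to an exported OpenVINO model
    "-d", "cpu",
    "-n", "2",
    "--rerank",                               # selects the text_rerank use case
    "--reranking_top_n", "1",
    "--reranking_max_length", "128",
    "--texts_file", "tools/llm_bench/prompts/texts_for_rerank.jsonl",
]
subprocess.run(cmd, check=True)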
17 changes: 13 additions & 4 deletions tools/llm_bench/llm_bench_utils/config_class.py
@@ -9,7 +9,8 @@
AutoModel,
SpeechT5ForTextToSpeech,
SpeechT5Processor,
SpeechT5HifiGan
SpeechT5HifiGan,
AutoModelForSequenceClassification
)
from diffusers.pipelines import DiffusionPipeline, LDMSuperResolutionPipeline
from optimum.intel.openvino import (
@@ -21,7 +22,8 @@
OVPipelineForInpainting,
OVPipelineForImage2Image,
OVModelForFeatureExtraction,
OVModelForTextToSpeechSeq2Seq
OVModelForTextToSpeechSeq2Seq,
OVModelForSequenceClassification
)
from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel

@@ -31,7 +33,8 @@
't5': AutoTokenizer,
'blenderbot': AutoTokenizer,
'falcon': AutoTokenizer,
'speecht5': SpeechT5Processor
'speecht5': SpeechT5Processor,
'bert': AutoTokenizer
}

TEXT_TO_SPEECH_VOCODER_CLS = SpeechT5HifiGan
@@ -42,6 +45,10 @@

IMAGE_TO_IMAGE_GEN_CLS = OVPipelineForImage2Image

TEXT_RERANK_GEN_CLS = OVModelForSequenceClassification

TEXT_RERANK_PT_GEN_CLS = AutoModelForSequenceClassification

OV_MODEL_CLASSES_MAPPING = {
'decoder': OVModelForCausalLM,
't5': OVModelForSeq2SeqLM,
@@ -136,7 +143,7 @@
"gptj"
],
'ldm_super_resolution': ['ldm-super-resolution'],
'text_embed': ["bge", "bert", "albert", "roberta", "xlm-roberta"],
'rag': ["bge", "bert", "albert", "roberta", "xlm-roberta"],
'text2speech': ['speecht5'],
}

@@ -148,7 +155,9 @@
'code_gen': 'decoder',
'ldm_super_resolution': 'ldm_super_resolution',
"vlm": "vlm",
'rag': 'bert',
'text_embed': 'bert',
'text_rerank': 'bert',
'text2speech': 'speecht5',
}

6 changes: 3 additions & 3 deletions tools/llm_bench/llm_bench_utils/hook_common.py
@@ -9,13 +9,13 @@
TRANS_MIN_VERSION = '4.40.0'


def get_bench_hook(num_beams, ov_model, embed=False):
def get_bench_hook(num_beams, ov_model, rag=False):
min_version = version.parse(TRANS_MIN_VERSION)
trans_version = version.parse(transformers.__version__)
search_type = 'beam search' if num_beams > 1 else 'greedy search'
if embed:
if rag:
import llm_bench_utils.hook_forward
bench_hook = llm_bench_utils.hook_forward.EmbedForwardHook()
bench_hook = llm_bench_utils.hook_forward.RAGForwardHook()
bench_hook.new_forward(ov_model)
return bench_hook

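The renamed RAGForwardHook (see hook_forward.py below) keeps the same timing role for both embedding and reranking models. As a rough illustration of the pattern, and not the project's actual implementation, such a hook wraps the model's forward call and records per-call latency:

import time
import types

class MinimalForwardTimingHook:
    # Illustrative only: records wall-clock latency of each forward() call.
    def __init__(self):
        self.tm_list = []

    def new_forward(self, model):
        original_forward = model.forward

        def timed_forward(wrapped_model, *args, **kwargs):
            start = time.perf_counter()
            result = original_forward(*args, **kwargs)
            self.tm_list.append(time.perf_counter() - start)
            return result

        model.forward = types.MethodType(timed_forward, model)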
2 changes: 1 addition & 1 deletion tools/llm_bench/llm_bench_utils/hook_forward.py
@@ -108,7 +108,7 @@ def my_vae_decoder(inputs, share_inputs=True, **kwargs):
pipe.vae_decoder.request = my_vae_decoder


class EmbedForwardHook:
class RAGForwardHook:
def __init__(self):
self.tm_list = []
self.tm_infer_list = []
24 changes: 15 additions & 9 deletions tools/llm_bench/llm_bench_utils/metrics_print.py
@@ -4,10 +4,10 @@
import logging as log


def print_metrics(
iter_num, iter_data, tms=None, tms_infer=None, warm_up=False,
stable_diffusion=None, tokenization_time=None, batch_size=1, prompt_idx=-1, whisper=None, text_emb=None, latency_unit=None, tts=None, cb_metric=None
):
def print_metrics(iter_num, iter_data, tms=None, tms_infer=None, warm_up=False,
stable_diffusion=None, tokenization_time=None, batch_size=1,
prompt_idx=-1, whisper=None, text_emb=None, latency_unit=None,
tts=None, cb_metric=None, text_rerank=None):
iter_str = str(iter_num)
if warm_up:
iter_str = 'warm-up'
@@ -42,15 +42,21 @@ def print_metrics(
iter_data['other_tokens_avg_latency'] = sum(tms[1:]) / (len(tms) - 1) * 1000 if len(tms) > 1 else -1
first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms"
other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/{latency_unit}"
if text_emb is None:
if text_emb is not None:
log.info(
f'{prefix} First token latency: {first_token_latency}, '
f'other tokens latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
f"{prefix} First iteration latency: {first_token_latency}, "
f'other iterations latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
)
elif text_rerank is not None:
log.info(
f"{prefix} First iteration latency: {first_token_latency}, "
f'other iterations latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]}, '
f'texts number: {text_rerank.get("texts_num", -1)}',
)
else:
log.info(
f'{prefix} First iteration latency: {first_token_latency}, '
f'other iterations latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
f"{prefix} First token latency: {first_token_latency}, "
f'other tokens latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
)
if len(tms) == 0:
log.warning(f'{prefix} No hook data output for first token latency and other tokens latency')
9 changes: 9 additions & 0 deletions tools/llm_bench/llm_bench_utils/model_utils.py
@@ -42,6 +42,8 @@ def get_param_from_file(args, input_key):
if args[input_key] is None:
if args['use_case'] in ['text_gen', 'text_embed', 'text2speech']:
data_list.append('What is OpenVINO?')
elif args['use_case'] in ['text_rerank']:
data_list.append("What are the main features of Intel Core Ultra processors?")
elif args['use_case'] == 'code_gen':
data_list.append('def print_hello_world():')
elif args['use_case'] == 'image_gen':
@@ -137,6 +139,11 @@ def analyze_args(args):
model_args['emb_pooling_type'] = args.embedding_pooling
model_args['emb_normalize'] = args.embedding_normalize
model_args["emb_max_length"] = args.embedding_max_length
model_args['rerank_max_length'] = args.reranking_max_length
model_args["rerank_top_n"] = args.reranking_top_n
model_args["rerank_texts"] = args.texts
model_args["rerank_texts_file"] = args.texts_file
model_args["rerank"] = args.rerank
model_args["apply_chat_template"] = args.apply_chat_template

optimum = args.optimum
@@ -177,6 +184,8 @@
raise RuntimeError(f'==Failure FOUND==: Incorrect model path:{model_path}')
if model_framework in ('ov', 'pt'):
use_case, model_name = get_use_case(args.model)
if use_case == 'rag':
use_case = 'text_rerank' if args.rerank else 'text_embed'
model_args['use_case'] = use_case
if use_case == 'code_gen' and not model_args['prompt'] and not model_args['prompt_file']:
model_args['prompt'] = 'def print_hello_world():'
78 changes: 77 additions & 1 deletion tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -11,6 +11,7 @@
import copy
import types
from llm_bench_utils.hook_common import get_bench_hook
from llm_bench_utils.memory_monitor import MemMonitorWrapper
from llm_bench_utils.hook_forward import MeanStdPair, RawImGenPerfMetrics
from llm_bench_utils.model_utils import get_version_in_format_to_pars
from llm_bench_utils.config_class import (
@@ -21,6 +22,7 @@
INPAINTING_IMAGE_GEN_CLS,
IMAGE_TO_IMAGE_GEN_CLS,
TEXT_TO_SPEECH_VOCODER_CLS,
TEXT_RERANK_GEN_CLS,
PA_ATTENTION_BACKEND
)
from transformers import pipeline
@@ -764,7 +766,7 @@ def forward_with_pooling(self, input_ids, attention_mask, token_type_ids=None, *
if kwargs.get("mem_consumption"):
memory_data_collector.stop_and_collect_data('compilation_phase')
memory_data_collector.log_data(compilation_phase=True)
bench_hook = get_bench_hook(1, ov_model, embed=True)
bench_hook = get_bench_hook(1, ov_model, rag=True)
from_pretrained_time = end - start
log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
return ov_model, tokenizer, from_pretrained_time, bench_hook, False
@@ -1144,3 +1146,77 @@ def _is_chinese_char(self, cp):
): #
return True
return False


def create_genai_text_reranker_model(model_path: Path, device: str, memory_monitor: MemMonitorWrapper, tokenizer: AutoTokenizer, **kwargs):
import openvino_genai

config = openvino_genai.TextRerankPipeline.Config()
if kwargs.get("rerank_top_n") is not None:
config.top_n = kwargs.get("rerank_top_n")
if kwargs.get("rerank_max_length") is not None:
config.max_length = kwargs.get("rerank_max_length")

ov_config = kwargs['config']

if kwargs.get("mem_consumption"):
memory_monitor.start()
start = time.perf_counter()
pipe = openvino_genai.TextRerankPipeline(model_path, device.upper(), config, **ov_config)
end = time.perf_counter()

log.info("Selected OpenVINO GenAI for benchmarking")
if kwargs.get("mem_consumption"):
memory_monitor.stop_and_collect_data('compilation_phase')
memory_monitor.log_data('for compilation phase')
log.info(f'Pipeline initialization time: {end - start:.2f}s')
return pipe, tokenizer, end - start, None, True
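Outside the benchmark flow, the GenAI pipeline configured above can be exercised directly. The sketch assumes the pipeline exposes a rerank(query, texts) call returning (index, score) pairs, as in recent openvino_genai releases; the model path and inputs are illustrative.

import openvino_genai

config = openvino_genai.TextRerankPipeline.Config()
config.top_n = 2
pipe = openvino_genai.TextRerankPipeline("models/ms-marco-TinyBERT-L2-v2", "CPU", config)

query = "What are the main features of Intel Core Ultra processors?"
texts = [
    "Intel Core Ultra processors incorporate an AI-optimized architecture.",
    "The weather in Munich is usually mild in spring.",
]
for index, score in pipe.rerank(query, texts):  # assumed return shape: (index, score) pairs
    print(index, score)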


def create_text_reranker_model(model_path: Path, device: str, memory_monitor: MemMonitorWrapper, **kwargs):
if model_path.name.endswith('xml'):
model_path = model_path.parents[2]

ov_config = kwargs['config']

model_path_existed = Path(model_path).exists()
# load model
if not model_path_existed:
raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')

trust_remote_code = False
try:
tokenizer = AutoTokenizer.from_pretrained(model_path)
except Exception:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
trust_remote_code = True
if kwargs.get("genai", True) and is_genai_available(log_msg=True):
try:
return create_genai_text_reranker_model(model_path, device, memory_monitor, tokenizer, **kwargs)
except Exception as exp:
log.warning(
f"Model is not supported by OpenVINO GenAI. "
f"GenAI pipeline loading failed with following error: {exp}"
"Benchmark will be switched to Optimum Intel pipeline realization"
)

log.info("Selected Optimum Intel for benchmarking")
if kwargs.get("mem_consumption"):
memory_monitor.start()
start = time.perf_counter()
ov_model = TEXT_RERANK_GEN_CLS.from_pretrained(
model_path,
device=device,
ov_config=ov_config,
trust_remote_code=trust_remote_code,
use_cache=False
)
end = time.perf_counter()

if kwargs.get("mem_consumption"):
memory_monitor.stop_and_collect_data('compilation_phase')
memory_monitor.log_data('for compilation phase')
bench_hook = get_bench_hook(1, ov_model, rag=True)
from_pretrained_time = end - start
log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
return ov_model, tokenizer, from_pretrained_time, bench_hook, False
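For reference, the Optimum Intel fallback loads the reranker as a sequence-classification (cross-encoder) model, which scores each (query, candidate) pair with a single relevance logit. A minimal scoring sketch, with placeholder model path and inputs:

from optimum.intel.openvino import OVModelForSequenceClassification
from transformers import AutoTokenizer

model_id = "models/ms-marco-TinyBERT-L2-v2"  # assumed local OpenVINO export
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = OVModelForSequenceClassification.from_pretrained(model_id)

query = "What are the main features of Intel Core Ultra processors?"
texts = [
    "Intel Core Ultra processors incorporate an AI-optimized architecture.",
    "The commercial PC market is propelled by premium computing solutions.",
]
inputs = tokenizer([query] * len(texts), texts, padding=True, truncation=True, return_tensors="pt")
scores = model(**inputs).logits.squeeze(-1)  # one relevance score per candidate
order = scores.argsort(descending=True)
print([(int(i), float(scores[i])) for i in order])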
58 changes: 54 additions & 4 deletions tools/llm_bench/llm_bench_utils/pt_utils.py
@@ -1,14 +1,22 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2023-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import torch
from llm_bench_utils.config_class import PT_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES, TEXT_TO_SPEECH_VOCODER_CLS
import os
import time
import torch
import json
import logging as log
from pathlib import Path

from llm_bench_utils.memory_monitor import MemMonitorWrapper
from llm_bench_utils.config_class import (
PT_MODEL_CLASSES_MAPPING,
TOKENIZE_CLASSES_MAPPING,
DEFAULT_MODEL_CLASSES,
TEXT_TO_SPEECH_VOCODER_CLS,
TEXT_RERANK_PT_GEN_CLS
)
import llm_bench_utils.hook_common as hook_common
import json


def set_bf16(model, device, **kwargs):
@@ -268,3 +276,45 @@ def create_ldm_super_resolution_model(model_path, device, memory_data_collector,
compiled_model = run_torch_compile(pipe, backend, memory_data_collector if kwargs.get("mem_consumption") else None)
pipe = compiled_model
return pipe, from_pretrain_time


def create_text_reranker_model(model_path: Path, device: str, memory_monitor: MemMonitorWrapper, **kwargs):
if not model_path.exists():
raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
if not device:
raise RuntimeError('==Failure ==: no device to load')
if not model_path.is_dir() or len(os.listdir(model_path)) == 0:
raise RuntimeError(f'==Failure ==: model path:{model_path} is not a directory or the directory is empty')

log.info(f'Load text reranker model from model path:{model_path}')
default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']]
model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']]
token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type])
if kwargs.get("mem_consumption"):
memory_monitor.start()
start = time.perf_counter()
pipe = TEXT_RERANK_PT_GEN_CLS.from_pretrained(model_path)
pipe = set_bf16(pipe, device, **kwargs)
end = time.perf_counter()
if kwargs.get("mem_consumption"):
memory_monitor.stop_and_collect_data('from_pretrained_phase')
memory_monitor.log_data('for from pretrained phase')
from_pretrain_time = end - start
processor = token_class.from_pretrained(model_path)
log.info(f'Model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s')

# If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch
if device.upper() == 'GPU':
device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable')
else:
device = torch.device(device.lower())
log.info(f'Torch device was set to: {device}')

pipe.to(device)

if kwargs['torch_compile_backend']:
backend = kwargs['torch_compile_backend']
compiled_model = run_torch_compile(pipe, backend, memory_monitor if kwargs.get("mem_consumption") else None)
pipe = compiled_model

return pipe, processor, from_pretrain_time, None, False
2 changes: 2 additions & 0 deletions tools/llm_bench/prompts/texts_for_rerank.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"text": "The commercial PC market is propelled by premium computing solutions that drive user productivity and help service organizations protect and maintain devices. Corporations must empower mobile and hybrid workers while extracting value from artificial intelligence (AI) to improve business outcomes. Moreover, both public and private sectors must address sustainability initiatives pertaining to the full life cycle of computing fleets. An inflection point in computing architecture is needed to stay ahead of evolving requirements. Introducing Intel® Core™ Ultra Processors Intel® Core™ Ultra processors shape the future of commercial computing in four major ways: Power Efficiency The new product line features a holistic approach to powerefficiency that benefits mobile work. Substantial changes to the microarchitecture, manufacturing process, packaging technology, and power management software result in up to 40% lower processor power consumption for modern tasks such as video conferencing with a virtual camera. Artificial Intelligence Intel Core Ultra processors incorporate an AI-optimized architecture that supports new user experiences and the next wave of commercial applications. The CPU, GPU, and the new neural processing unit (NPU) are all capable of executing AI tasks as directed by application developers. For example, elevated mobile collaboration is possible with support for AI assisted background blur, noise suppression, eye tracking, and picture framing. Intel Core Ultra processors are capable of up to 2.5x the AI inference performance per watt as compared to Intel’s previous mobile processor offering."}
{"text": "Intel Core Ultra processors incorporate an AI-optimized architecture that supports new user experiences and the next wave of commercial applications."}
1 change: 1 addition & 0 deletions tools/llm_bench/requirements.txt
@@ -17,3 +17,4 @@ tiktoken
librosa # For Whisper
matplotlib
jinja2>=3.1.0
scipy