2 changes: 2 additions & 0 deletions .coveragerc
@@ -3,6 +3,8 @@ omit =
genai_bench/cli/report.py
genai_bench/analysis/excel_report.py
genai_bench/analysis/plot_report.py
genai_bench/analysis/flexible_plot_report.py
genai_bench/analysis/plot_config.py
genai_bench/ui/*
genai_bench/logging.py
tests/*
8 changes: 4 additions & 4 deletions docs/user-guide/generate-plot.md
@@ -3,12 +3,12 @@
## Quick Start
You can check out `genai-bench plot --help` to see how to generate a 2x4 plot containing:

1. Output Inference Speed (tokens/s) vs Output Throughput of Server (tokens/s)
2. TTFT (s) vs Output Throughput of Server (tokens/s)
1. Per-Request Inference Speed (tokens/s) vs Server Output Throughput (tokens/s)
2. TTFT (s) vs Server Output Throughput (tokens/s)
3. Mean E2E Latency (s) per Request vs RPS
4. Error Rates by HTTP Status vs Concurrency
5. Output Inference Speed per Request (tokens/s) vs Total Throughput (Input + Output) of Server (tokens/s)
6. TTFT (s) vs Total Throughput (Input + Output) of Server (tokens/s)
5. Per-Request Inference Speed (tokens/s) vs Server Total Throughput (Input + Output) (tokens/s)
6. TTFT (s) vs Server Total Throughput (Input + Output) (tokens/s)
7. P90 E2E Latency (s) per Request vs RPS
8. P99 E2E Latency (s) per Request vs RPS
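If you prefer to produce the same plots from Python rather than the CLI, a minimal sketch based on `examples/experiment_plots.py` in this PR is shown below; the experiment folder path and the model filter are placeholders, and the argument order of `load_multiple_experiments` (folder, then filter criteria) is assumed from that example rather than confirmed here.

```python
from genai_bench.analysis.experiment_loader import load_multiple_experiments
from genai_bench.analysis.flexible_plot_report import plot_experiment_data_flexible
from genai_bench.logging import LoggingManager

LoggingManager("plot")

# Placeholder: folder that holds one or more experiment result folders.
folder_name = "<path to the experiment folder>"
# Optional: keep only runs whose metadata matches these values (placeholder model name).
filter_criteria = {"model": "<model-name>"}

# Assumed call shape, mirroring examples/experiment_plots.py in this PR.
run_data_list = load_multiple_experiments(folder_name, filter_criteria)

if not run_data_list:
    print("Empty data after filtering")
else:
    # One figure per group, grouped by each run's server_version metadata.
    plot_experiment_data_flexible(
        run_data_list, group_key="server_version", experiment_folder=folder_name
    )
```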

6 changes: 3 additions & 3 deletions docs/user-guide/multi-cloud-auth-storage.md
@@ -176,7 +176,7 @@ genai-bench benchmark \
--max-requests-per-run 100 \
--max-time-per-run 10
```
**Note:** for a Dedicated model, the `--api-model-name` is just a placeholder; the actual model depends on the endpointId you provided

**Advanced features:**
```bash
@@ -343,7 +343,7 @@ vLLM and SGLang use OpenAI-compatible APIs with optional authentication.
**Example:**
```bash
genai-bench benchmark \
--api-backend vllm \
--api-backend sglang \
--api-base http://localhost:8000 \
--api-key optional-key \
--api-model-name meta-llama/Llama-2-7b-hf \
@@ -657,4 +657,4 @@ The main changes are:

- `--bucket` → `--storage-bucket`
- `--prefix` → `--storage-prefix`
- Add `--storage-provider oci` (though OCI is the default for backward compatibility)
4 changes: 2 additions & 2 deletions docs/user-guide/multi-cloud-quick-reference.md
@@ -2,7 +2,7 @@

This is a quick reference guide for common multi-cloud scenarios with genai-bench. For detailed information, see the [comprehensive guide](multi-cloud-auth-storage.md).

> **Note**: For OpenAI, vLLM, and SGLang backends, both `--api-key` and `--model-api-key` are supported for backward compatibility.
> **Note**: For OpenAI, SGLang and vLLM backends, both `--api-key` and `--model-api-key` are supported for backward compatibility.

## OpenAI Benchmarking

@@ -277,4 +277,4 @@ export GITHUB_REPO=benchmarks
```bash
# HuggingFace (for downloading tokenizers)
export HF_TOKEN=hf_...
```
8 changes: 4 additions & 4 deletions docs/user-guide/run-benchmark.md
@@ -21,12 +21,12 @@ export TRANSFORMERS_VERBOSITY=error
genai-bench benchmark --api-backend openai \
--api-base "http://localhost:8082" \
--api-key "your-openai-api-key" \
--api-model-name "vllm-model" \
--api-model-name "meta-llama/Meta-Llama-3-70B-Instruct" \
--model-tokenizer "/mnt/data/models/Meta-Llama-3.1-70B-Instruct" \
--task text-to-text \
--max-time-per-run 15 \
--max-requests-per-run 300 \
--server-engine "vLLM" \
--server-engine "SGLang" \
--server-gpu-type "H100" \
--server-version "v0.6.0" \
--server-gpu-count 4
@@ -119,7 +119,7 @@ genai-bench benchmark --api-backend oci-cohere \
--api-base "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" \
--api-model-name "c4ai-command-r-08-2024" \
--model-tokenizer "/home/ubuntu/c4ai-command-r-08-2024" \
--server-engine "vLLM" \
--server-engine "SGLang" \
--task text-to-text \
--num-concurrency 1 \
--server-gpu-type A100-80G \
@@ -344,4 +344,4 @@ If you want to benchmark a specific portion of a vision dataset, you can use the
- Access to ALL HuggingFace `load_dataset` parameters
- Reusable and version-controllable
- Support for complex configurations
- Future-proof (no CLI updates needed for new HuggingFace features)
6 changes: 3 additions & 3 deletions docs/user-guide/upload-benchmark-result.md
@@ -16,12 +16,12 @@ To enable result uploading, use the following options with the `benchmark` comma
genai-bench benchmark \
--api-base "http://localhost:8082" \
--api-key "your-openai-api-key" \
--api-model-name "vllm-model" \
--api-model-name "meta-llama/Meta-Llama-3-70B-Instruct" \
--model-tokenizer "/mnt/data/models/Meta-Llama-3.1-70B-Instruct" \
--task text-to-text \
--max-time-per-run 15 \
--max-requests-per-run 300 \
--server-engine "vLLM" \
--server-engine "SGLang" \
--server-gpu-type "H100" \
--server-version "v0.6.0" \
--server-gpu-count 4 \
@@ -44,4 +44,4 @@ GenAI Bench now supports multiple cloud storage providers:
- **GCP Cloud Storage**: Use `--storage-provider gcp`
- **GitHub Releases**: Use `--storage-provider github`

For detailed configuration and authentication options for each provider, please refer to the [Multi-Cloud Authentication & Storage Guide](multi-cloud-auth-storage.md).
2 changes: 1 addition & 1 deletion examples/experiment_excel.py
@@ -11,7 +11,7 @@
LoggingManager("excel")


folder_name = "/Users/changsu/openai_chat_vllm-model_tokenizer__mnt_data_models_Llama-3-70B-Instruct_20240904_003850" # noqa: E501
folder_name = "/Users/changsu/openai_chat_sglang-model_tokenizer__mnt_data_models_Llama-3-70B-Instruct_20240904_003850" # noqa: E501
os.makedirs(folder_name, exist_ok=True)
experiment_metadata, run_data = load_one_experiment(folder_name)
create_workbook(
12 changes: 6 additions & 6 deletions examples/experiment_plots.py
@@ -6,16 +6,16 @@
load_multiple_experiments,
load_one_experiment,
)
from genai_bench.analysis.plot_report import plot_experiment_data
from genai_bench.analysis.flexible_plot_report import plot_experiment_data_flexible
from genai_bench.logging import LoggingManager

LoggingManager("plot")


# Example usage with filtering multiple experiments
folder_name = "/Users/changsu/experiment_plot"
folder_name = "<Path to the experiment folder>"
filter_criteria = {
"model": "vllm-model",
"model": "Llama-4-Scout-17B-16E-Instruct",
}

os.makedirs(folder_name, exist_ok=True)
@@ -26,20 +26,20 @@
print("Empty data after filtering")
else:
# Plot the data grouped by 'server_version'
plot_experiment_data(
plot_experiment_data_flexible(
run_data_list, group_key="server_version", experiment_folder=folder_name
)

# Plot for one experiment
experiment_folder = os.path.join(
folder_name,
"openai_chat_vllm-model_tokenizer__mnt_data_models_Llama-3-70B-Instruct_20240904_003850",
"openai_SGLang_v0.4.7.post1_text-to-text_Llama-4-Scout-17B-16E-Instruct_20250620_042005",
)
experiment_metadata, run_data = load_one_experiment(experiment_folder)
if not experiment_metadata or not run_data:
print("Didn't find any experiment data")
else:
plot_experiment_data(
plot_experiment_data_flexible(
[
[experiment_metadata, run_data],
],
8 changes: 4 additions & 4 deletions genai_bench/analysis/excel_report.py
@@ -195,17 +195,17 @@ def _create_appendix_sheet_common(
[
"End-to-End Latency per Request (s)",
"Request Throughput (RPS)",
"Total Throughput (Input + Output) of Server (tokens/s)",
"Server Total Throughput (Input + Output) (tokens/s)",
]
)
else:
headers.extend(
[
"Output Inference Speed per Request (tokens/s)",
"Output Throughput of Server (tokens/s)",
"Per-Request Inference Speed (tokens/s)",
"Server Output Throughput (tokens/s)",
"End-to-End Latency per Request (s)",
"Request Throughput (RPS)",
"Total Throughput (Input + Output) of Server (tokens/s)",
"Server Total Throughput (Input + Output) (tokens/s)",
]
)

24 changes: 12 additions & 12 deletions genai_bench/analysis/plot_config.py
@@ -138,20 +138,20 @@ class PlotConfigManager:
"layout": {"rows": 2, "cols": 4, "figsize": [32, 12]},
"plots": [
{
"title": "Output Inference Speed per Request vs "
"Output Throughput of Server",
"title": "Per-Request Inference Speed vs "
"Server Output Throughput",
"x_field": "mean_output_throughput_tokens_per_s",
"y_field": "stats.output_inference_speed.mean",
"x_label": "Output Throughput of Server (tokens/s)",
"y_label": "Output Inference Speed per Request (tokens/s)",
"x_label": "Server Output Throughput (tokens/s)",
"y_label": "Per-Request Inference Speed (tokens/s)",
"plot_type": "line",
"position": [0, 0],
},
{
"title": "TTFT vs Output Throughput of Server",
"title": "TTFT vs Server Output Throughput",
"x_field": "mean_output_throughput_tokens_per_s",
"y_field": "stats.ttft.mean",
"x_label": "Output Throughput of Server (tokens/s)",
"x_label": "Server Output Throughput (tokens/s)",
"y_label": "TTFT",
"plot_type": "line",
"position": [0, 1],
@@ -175,20 +175,20 @@
"position": [0, 3],
},
{
"title": "Output Inference Speed per Request vs "
"Total Throughput (Input + Output) of Server",
"title": "Per-Request Inference Speed vs "
"Server Total Throughput (Input + Output)",
"x_field": "mean_total_tokens_throughput_tokens_per_s",
"y_field": "stats.output_inference_speed.mean",
"x_label": "Total Throughput (Input + Output) of Server (tokens/s)",
"y_label": "Output Inference Speed per Request (tokens/s)",
"x_label": "Server Total Throughput (Input + Output) (tokens/s)",
"y_label": "Per-Request Inference Speed (tokens/s)",
"plot_type": "line",
"position": [1, 0],
},
{
"title": "TTFT vs Total Throughput (Input + Output) of Server",
"title": "TTFT vs Server Total Throughput (Input + Output)",
"x_field": "mean_total_tokens_throughput_tokens_per_s",
"y_field": "stats.ttft.mean",
"x_label": "Total Throughput (Input + Output) of Server (tokens/s)",
"x_label": "Server Total Throughput (Input + Output) (tokens/s)",
"y_label": "TTFT",
"plot_type": "line",
"position": [1, 1],
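As a point of reference, a smaller custom preset can reuse exactly the per-plot fields shown in the default layout above (`title`, `x_field`, `y_field`, `x_label`, `y_label`, `plot_type`, `position`). The sketch below is illustrative only: it uses only field names visible in this diff, and how such a preset is actually supplied (for example via the `--plot-config` option visible in `genai_bench/cli/report.py`) is an assumption, not something this diff spells out.

```python
# Illustrative only: a 1x2 preset assembled from fields that appear in the
# default configuration above. The mechanism for loading a custom preset
# (e.g. a file passed to --plot-config) is assumed, not shown in this diff.
custom_plot_config = {
    "layout": {"rows": 1, "cols": 2, "figsize": [16, 6]},
    "plots": [
        {
            "title": "TTFT vs Server Output Throughput",
            "x_field": "mean_output_throughput_tokens_per_s",
            "y_field": "stats.ttft.mean",
            "x_label": "Server Output Throughput (tokens/s)",
            "y_label": "TTFT",
            "plot_type": "line",
            "position": [0, 0],
        },
        {
            "title": "Per-Request Inference Speed vs "
            "Server Total Throughput (Input + Output)",
            "x_field": "mean_total_tokens_throughput_tokens_per_s",
            "y_field": "stats.output_inference_speed.mean",
            "x_label": "Server Total Throughput (Input + Output) (tokens/s)",
            "y_label": "Per-Request Inference Speed (tokens/s)",
            "plot_type": "line",
            "position": [0, 1],
        },
    ],
}
```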
61 changes: 21 additions & 40 deletions genai_bench/analysis/plot_report.py
@@ -51,25 +51,9 @@ def plot_graph(
else:
x_positions = x_data # type: ignore[assignment]

# If this is TTFT or E2E latency, filter out values outside [0.1, 100]
valid_x = []
valid_y = []
valid_concurrency = []

should_cap = any(
kw in y_label.lower() for kw in ["ttft", "mean e2e", "p90 e2e", "p99 e2e"]
)

if should_cap:
for xx, yy, cc in zip(x_data, y_data, concurrency_levels, strict=False):
if 0.1 <= yy <= 100:
valid_x.append(xx)
valid_y.append(yy)
valid_concurrency.append(cc)
else:
valid_x = x_data
valid_y = y_data
valid_concurrency = concurrency_levels
valid_x = x_data
valid_y = y_data
valid_concurrency = concurrency_levels

# Plot data
if plot_type == "line":
@@ -101,11 +85,8 @@
mticker.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1, numticks=100)
)

# Cap the y-limits if needed
if should_cap:
ax.set_ylim([0.1, 100])
else:
ax.set_ylim(bottom=0)
ax.set_ylim(bottom=0)
ax.set_xlim(left=0)

ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
@@ -156,10 +137,9 @@ def plot_metrics(
].mean_output_throughput_tokens_per_s
for c in concurrency_levels
],
"x_label": "Output Throughput of Server (tokens/s)",
"y_label": "Output Inference Speed per Request (tokens/s)",
"title": "Output Inference Speed per Request "
"vs Output Throughput of Server",
"x_label": "Server Output Throughput (tokens/s)",
"y_label": "Per-Request Inference Speed (tokens/s)",
"title": "Per-Request Inference Speed vs Server Output Throughput",
"plot_type": "line",
"ax": axs[0, 0],
},
@@ -174,9 +154,9 @@
].mean_output_throughput_tokens_per_s
for c in concurrency_levels
],
"x_label": "Output Throughput of Server (tokens/s)",
"x_label": "Server Output Throughput (tokens/s)",
"y_label": "TTFT",
"title": "TTFT vs Output Throughput of Server",
"title": "TTFT vs Server Output Throughput",
"plot_type": "line",
"ax": axs[0, 1],
},
@@ -209,10 +189,10 @@
].mean_total_tokens_throughput_tokens_per_s
for c in concurrency_levels
],
"x_label": "Total Throughput (Input + Output) of Server (tokens/s)",
"y_label": "Output Inference Speed per Request (tokens/s)",
"title": "Output Inference Speed per Request vs "
"Total Throughput (Input + Output) of Server",
"x_label": "Server Total Throughput (Input + Output) (tokens/s)",
"y_label": "Per-Request Inference Speed (tokens/s)",
"title": "Per-Request Inference Speed vs "
"Server Total Throughput (Input + Output)",
"plot_type": "line",
"ax": axs[1, 0],
},
@@ -227,9 +207,9 @@
].mean_total_tokens_throughput_tokens_per_s
for c in concurrency_levels
],
"x_label": "Total Throughput (Input + Output) of Server (tokens/s)",
"x_label": "Server Total Throughput (Input + Output) (tokens/s)",
"y_label": "TTFT",
"title": "TTFT vs Total Throughput (Input + Output) of Server",
"title": "TTFT vs Server Total Throughput (Input + Output)",
"plot_type": "line",
"ax": axs[1, 1],
},
@@ -633,10 +613,10 @@ def plot_single_scenario_inference_speed_vs_throughput(
ax=ax,
x_data=valid_x_data,
y_data=valid_y_data,
x_label="Output Throughput of Server (tokens/s)",
y_label="Output Inference Speed per Request (tokens/s)",
title=f"Output Inference Speed per Request vs "
f"Output Throughput of Server - {scenario_label}",
x_label="Server Output Throughput (tokens/s)",
y_label="Per-Request Inference Speed (tokens/s)",
title=f"Per-Request Inference Speed vs "
f"Server Output Throughput - {scenario_label}",
concurrency_levels=valid_concurrency,
label=f"Scenario: {scenario_label}",
plot_type="line",
@@ -716,5 +696,6 @@ def plot_error_rates(
ax.set_ylabel("Error Rate")
ax.set_title("Error Rates by HTTP Status vs Concurrency")
ax.set_ylim(bottom=0)
ax.set_xlim(left=0)
ax.legend()
ax.grid(True)
5 changes: 3 additions & 2 deletions genai_bench/cli/cli.py
@@ -11,9 +11,10 @@
from genai_bench.analysis.excel_report import create_workbook
from genai_bench.analysis.experiment_loader import load_one_experiment
from genai_bench.analysis.plot_report import (
plot_experiment_data,
plot_single_scenario_inference_speed_vs_throughput,
)
from genai_bench.analysis.flexible_plot_report import plot_experiment_data_flexible

from genai_bench.auth.unified_factory import UnifiedAuthFactory
from genai_bench.cli.option_groups import (
api_options,
@@ -506,7 +507,7 @@ def benchmark(
),
percentile="mean",
)
plot_experiment_data(
plot_experiment_data_flexible(
[
(experiment_metadata, run_data),
],
2 changes: 1 addition & 1 deletion genai_bench/cli/report.py
@@ -71,7 +71,7 @@ def excel(ctx, experiment_folder, excel_name, metric_percentile):
default=None,
callback=validate_filter_criteria,
help="A dictionary containing filter criteria for the plot. Default: {}. "
"Example: '{'model': 'vllm-model'}'",
"Example: '{'model': 'meta-llama/Meta-Llama-3-70B-Instruct'}'",
)
@click.option(
"--plot-config",