184 changes: 128 additions & 56 deletions onnxruntime/python/tools/transformers/models/stable_diffusion/README.md

Large diffs are not rendered by default.

@@ -53,7 +53,9 @@
f"Batch size {len(prompt)} is larger than allowed {max_batch_size}. If dynamic shape is used, then maximum batch size is 4"
)

pipeline_info = PipelineInfo(args.version)
min_image_size = 512
max_image_size = 1024 if args.version in ["2.0", "2.1"] else 768
pipeline_info = PipelineInfo(args.version, min_image_size=min_image_size, max_image_size=max_image_size)
pipeline = init_pipeline(Txt2ImgPipeline, pipeline_info, engine_type, args, max_batch_size, batch_size)

if engine_type == EngineType.TRT:
@@ -46,11 +46,18 @@ def load_pipelines(args, batch_size):
if batch_size > max_batch_size:
raise ValueError(f"Batch size {batch_size} is larger than allowed {max_batch_size}.")

# For TensorRT, the performance of an engine built with dynamic shapes is very sensitive to the range of image sizes.
# Here, we narrow the image size range for TensorRT to trade off some flexibility for performance.
min_image_size = 832 if args.engine != "ORT_CUDA" else 512
max_image_size = 1216 if args.engine != "ORT_CUDA" else 2048

# No VAE decoder is needed in the base pipeline when it outputs latents instead of images.
base_info = PipelineInfo(args.version, use_vae=False, min_image_size=640, max_image_size=1536)
base_info = PipelineInfo(args.version, use_vae=False, min_image_size=min_image_size, max_image_size=max_image_size)
base = init_pipeline(Txt2ImgXLPipeline, base_info, engine_type, args, max_batch_size, batch_size)

refiner_info = PipelineInfo(args.version, is_refiner=True, min_image_size=640, max_image_size=1536)
refiner_info = PipelineInfo(
args.version, is_refiner=True, min_image_size=min_image_size, max_image_size=max_image_size
)
refiner = init_pipeline(Img2ImgXLPipeline, refiner_info, engine_type, args, max_batch_size, batch_size)

if engine_type == EngineType.TRT:
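
As a rough, hypothetical sketch (not part of this change) of why the range matters: with dynamic shapes the TensorRT engine must cover every latent resolution between the configured minimum and maximum, so a profile for the UNet latent input could look roughly like the following. Latent dims are the image dims divided by 8; the demo's engine builder may use a different profile format and also account for classifier-free guidance batch doubling.

# Hypothetical dynamic-shape profile derived from the configured image-size range.
min_image_size, max_image_size = 832, 1216   # values used for TensorRT above
batch_size = 1
unet_latent_profile = {
    "sample": [
        (batch_size, 4, min_image_size // 8, min_image_size // 8),  # min shape
        (batch_size, 4, 1024 // 8, 1024 // 8),                      # opt shape
        (batch_size, 4, max_image_size // 8, max_image_size // 8),  # max shape
    ]
}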
@@ -139,52 +146,57 @@ def run_demo(args):


def run_dynamic_shape_demo(args):
"""Run demo of generating images with different size with list of prompts with ORT CUDA provider."""
"""Run demo of generating images with different settings with ORT CUDA provider."""
args.engine = "ORT_CUDA"
args.scheduler = "UniPC"
args.denoising_steps = 8
args.disable_cuda_graph = True
base, refiner = load_pipelines(args, 1)

batch_size = args.repeat_prompt
base, refiner = load_pipelines(args, batch_size)
prompts = [
"starry night over Golden Gate Bridge by van gogh",
"beautiful photograph of Mt. Fuji during cherry blossom",
"little cute gremlin sitting on a bed, cinematic",
"cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
"beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
"blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
]

image_sizes = [
(1024, 1024),
(1152, 896),
(896, 1152),
(1216, 832),
(832, 1216),
(1344, 768),
(768, 1344),
(1536, 640),
(640, 1536),
# batch size, height, width, scheduler, steps, prompt
configs = [
(1, 832, 1216, "UniPC", 8, prompts[0]),
(1, 1024, 1024, "DDIM", 24, prompts[1]),
(1, 1216, 832, "EulerA", 18, prompts[2]),
(2, 1344, 768, "DDIM", 30, prompts[3]),
(2, 640, 1536, "UniPC", 18, prompts[4]),
(2, 1152, 896, "EulerA", 30, prompts[5]),
]

# Warm up the pipelines. This only need once before serving.
# Warm up once before serving (to trigger cuDNN convolution algorithm search).
args.prompt = ["warm up"]
args.num_warmup_runs = 3
prompt, negative_prompt = repeat_prompt(args)
for height, width in image_sizes:
args.num_warmup_runs = 1
for batch_size, height, width, _, _, _ in configs:
args.batch_size = batch_size
args.height = height
args.width = width
print(f"\nWarm up pipelines for Batch_size={batch_size}, Height={height}, Width={width}")
print(f"\nWarm up batch_size={batch_size}, height={height}, width={width}")
prompt, negative_prompt = repeat_prompt(args)
run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=True)

# Run pipeline on a list of prompts.
prompts = [
"starry night over Golden Gate Bridge by van gogh",
"little cute gremlin sitting on a bed, cinematic",
]
args.num_warmup_runs = 0
for example_prompt in prompts:
for batch_size, height, width, scheduler, steps, example_prompt in configs:
args.prompt = [example_prompt]
args.batch_size = batch_size
args.height = height
args.width = width
args.scheduler = scheduler
args.denoising_steps = steps
base.set_scheduler(scheduler)
refiner.set_scheduler(scheduler)
print(
f"\nbatch_size={batch_size}, height={height}, width={width}, scheduler={scheduler}, steps={steps}, prompt={example_prompt}"
)
prompt, negative_prompt = repeat_prompt(args)

for height, width in image_sizes:
args.height = height
args.width = width
print(f"\nBatch_size={batch_size}, Height={height}, Width={width}, Prompt={example_prompt}")
run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False)
run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False)

base.teardown()
refiner.teardown()
@@ -68,7 +68,7 @@ def parse_arguments(is_xl: bool, description: str):
"--scheduler",
type=str,
default="DDIM",
choices=["DDIM", "UniPC"] if is_xl else ["DDIM", "EulerA", "UniPC"],
choices=["DDIM", "EulerA", "UniPC"],
help="Scheduler for diffusion process",
)

@@ -84,7 +84,7 @@ def parse_arguments(is_xl: bool, description: str):
"--negative-prompt", nargs="*", default=[""], help="Optional negative prompt(s) to guide the image generation."
)
parser.add_argument(
"--repeat-prompt",
"--batch-size",
type=int,
default=1,
choices=[1, 2, 4, 8, 16],
@@ -194,7 +194,7 @@ def parse_arguments(is_xl: bool, description: str):
def repeat_prompt(args):
if not isinstance(args.prompt, list):
raise ValueError(f"`prompt` must be of type `str` or `str` list, but is {type(args.prompt)}")
prompt = args.prompt * args.repeat_prompt
prompt = args.prompt * args.batch_size

if not isinstance(args.negative_prompt, list):
raise ValueError(
@@ -255,6 +255,7 @@ def init_pipeline(pipeline_class, pipeline_info, engine_type, args, max_batch_si
static_image_shape=not args.build_dynamic_shape,
max_workspace_size=0,
device_id=torch.cuda.current_device(),
timing_cache=timing_cache,
)
elif engine_type == EngineType.TRT:
# Load TensorRT engines and pytorch modules
@@ -90,13 +90,15 @@ def __init__(
use_vae=False,
min_image_size=256,
max_image_size=1024,
use_fp16_vae=True,
):
self.version = version
self._is_inpaint = is_inpaint
self._is_refiner = is_refiner
self._use_vae = use_vae
self._min_image_size = min_image_size
self._max_image_size = max_image_size
self._use_fp16_vae = use_fp16_vae
if is_refiner:
assert self.is_xl()

@@ -127,6 +129,13 @@ def stages(self) -> List[str]:
def vae_scaling_factor(self) -> float:
return 0.13025 if self.is_xl() else 0.18215

def vae_torch_fallback(self) -> bool:
return self.is_xl() and not self._use_fp16_vae

def custom_fp16_vae(self) -> Optional[str]:
# For SD XL, use a VAE that is fine-tuned to run in fp16 precision without generating NaNs
return "madebyollin/sdxl-vae-fp16-fix" if self._use_fp16_vae and self.is_xl() else None

@staticmethod
def supported_versions(is_xl: bool):
return ["xl-1.0"] if is_xl else ["1.4", "1.5", "2.0-base", "2.0", "2.1", "2.1-base"]
@@ -44,7 +44,6 @@ def __init__(

alphas = 1.0 - betas
self.alphas_cumprod = torch.cumprod(alphas, dim=0)

# standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0

@@ -71,7 +70,7 @@ def configure(self):
self.variance = torch.from_numpy(variance).to(self.device)

timesteps = self.timesteps.long().cpu()
self.alphas_cumprod = self.alphas_cumprod[timesteps].to(self.device)
self.filtered_alphas_cumprod = self.alphas_cumprod[timesteps].to(self.device)
self.final_alpha_cumprod = self.final_alpha_cumprod.to(self.device)

def scale_model_input(self, sample: torch.FloatTensor, idx, *args, **kwargs) -> torch.FloatTensor:
@@ -124,9 +123,9 @@ def step(
# - pred_prev_sample -> "x_t-1"

prev_idx = idx + 1
alpha_prod_t = self.alphas_cumprod[idx]
alpha_prod_t = self.filtered_alphas_cumprod[idx]
alpha_prod_t_prev = (
self.alphas_cumprod[prev_idx] if prev_idx < self.num_inference_steps else self.final_alpha_cumprod
self.filtered_alphas_cumprod[prev_idx] if prev_idx < self.num_inference_steps else self.final_alpha_cumprod
)

beta_prod_t = 1 - alpha_prod_t
@@ -179,15 +178,15 @@ def step(
variance_noise = torch.randn(
model_output.shape, generator=generator, device=device, dtype=model_output.dtype
)
variance = variance ** (0.5) * eta * variance_noise
variance = std_dev_t * variance_noise

prev_sample = prev_sample + variance

return prev_sample

def add_noise(self, init_latents, noise, idx, latent_timestep):
sqrt_alpha_prod = self.alphas_cumprod[idx] ** 0.5
sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[idx]) ** 0.5
sqrt_alpha_prod = self.filtered_alphas_cumprod[idx] ** 0.5
sqrt_one_minus_alpha_prod = (1 - self.filtered_alphas_cumprod[idx]) ** 0.5
noisy_latents = sqrt_alpha_prod * init_latents + sqrt_one_minus_alpha_prod * noise

return noisy_latents
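
For reference, a minimal standalone sketch of the forward-noising identity that add_noise implements; the 0.5 value for the cumulative alpha is made up for illustration.

import torch

alpha_bar = torch.tensor(0.5)              # stands in for filtered_alphas_cumprod[idx]
init_latents = torch.randn(1, 4, 64, 64)   # clean latents x_0
noise = torch.randn_like(init_latents)     # Gaussian noise eps

# x_t = sqrt(alpha_bar) * x_0 + sqrt(1 - alpha_bar) * eps
noisy_latents = alpha_bar.sqrt() * init_latents + (1 - alpha_bar).sqrt() * noise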
@@ -60,15 +60,8 @@ def __init__(
self.torch_device = torch.device(device, torch.cuda.current_device())
self.stages = pipeline_info.stages()

# TODO: use custom fp16 for ORT_TRT, and no need to fallback to torch.
self.vae_torch_fallback = self.pipeline_info.is_xl() and engine_type != EngineType.ORT_CUDA

# For SD XL, use an VAE that modified to run in fp16 precision without generating NaNs.
self.custom_fp16_vae = (
"madebyollin/sdxl-vae-fp16-fix"
if self.pipeline_info.is_xl() and self.engine_type == EngineType.ORT_CUDA
else None
)
self.vae_torch_fallback = self.pipeline_info.vae_torch_fallback()
self.custom_fp16_vae = self.pipeline_info.custom_fp16_vae()

self.models = {}
self.engines = {}
@@ -13,21 +13,33 @@
from diffusion_models import PipelineInfo
from engine_builder import EngineBuilder, EngineType
from ort_utils import CudaSession
from packaging import version

import onnxruntime as ort

logger = logging.getLogger(__name__)


class OrtTensorrtEngine(CudaSession):
def __init__(self, engine_path, device_id, onnx_path, fp16, input_profile, workspace_size, enable_cuda_graph):
def __init__(
self,
engine_path,
device_id,
onnx_path,
fp16,
input_profile,
workspace_size,
enable_cuda_graph,
timing_cache_path=None,
):
self.engine_path = engine_path
self.ort_trt_provider_options = self.get_tensorrt_provider_options(
input_profile,
workspace_size,
fp16,
device_id,
enable_cuda_graph,
timing_cache_path=timing_cache_path,
)

session_options = ort.SessionOptions()
@@ -45,7 +57,9 @@ def __init__(self, engine_path, device_id, onnx_path, fp16, input_profile, works
device = torch.device("cuda", device_id)
super().__init__(ort_session, device, enable_cuda_graph)

def get_tensorrt_provider_options(self, input_profile, workspace_size, fp16, device_id, enable_cuda_graph):
def get_tensorrt_provider_options(
self, input_profile, workspace_size, fp16, device_id, enable_cuda_graph, timing_cache_path=None
):
trt_ep_options = {
"device_id": device_id,
"trt_fp16_enable": fp16,
@@ -55,6 +69,9 @@ def get_tensorrt_provider_options(self, input_profile, workspace_size, fp16, dev
"trt_engine_cache_path": self.engine_path,
}

if version.parse(ort.__version__) > version.parse("1.16.2") and timing_cache_path is not None:
trt_ep_options["trt_timing_cache_path"] = timing_cache_path

if enable_cuda_graph:
trt_ep_options["trt_cuda_graph_enable"] = True

@@ -153,6 +170,7 @@ def build_engines(
static_image_shape=True,
max_workspace_size=0,
device_id=0,
timing_cache=None,
):
self.torch_device = torch.device("cuda", device_id)
self.load_models(framework_model_dir)
@@ -224,7 +242,6 @@ def build_engines(

engine_path = self.get_engine_path(engine_dir, model_name, profile_id)
onnx_opt_path = self.get_onnx_path(model_name, onnx_dir, opt=True)

if not self.has_engine_file(engine_path):
logger.info(
"Building TensorRT engine for %s from %s to %s. It can take a while to complete...",
@@ -251,6 +268,7 @@
input_profile=input_profile,
workspace_size=self.get_work_space_size(model_name, max_workspace_size),
enable_cuda_graph=self.use_cuda_graph,
timing_cache_path=timing_cache,
)

built_engines[model_name] = engine
@@ -91,7 +91,10 @@ def optimize(
if keep_outputs:
m.prune_graph(outputs=keep_outputs)

use_external_data_format = m.model.ByteSize() >= onnx.checker.MAXIMUM_PROTOBUF
model_size = m.model.ByteSize()

# The model size can be negative (integer overflow?) on Windows.
use_external_data_format = model_size <= 0 or model_size >= onnx.checker.MAXIMUM_PROTOBUF

# Note that ORT < 1.16 could not save models larger than 2GB.
# This step is optional since it has no impact on inference latency.
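
A small sketch of the intended behavior using the public onnx API on a hypothetical model file; the optimizer's own save helper may differ.

import onnx

model = onnx.load("model.onnx")  # placeholder path
model_size = model.ByteSize()

# Fall back to external data when the proto is too large, or when ByteSize() overflows to a negative value.
use_external_data_format = model_size <= 0 or model_size >= onnx.checker.MAXIMUM_PROTOBUF
onnx.save_model(
    model,
    "model_optimized.onnx",
    save_as_external_data=use_external_data_format,
    all_tensors_to_one_file=True,
)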