184 changes: 128 additions & 56 deletions onnxruntime/python/tools/transformers/models/stable_diffusion/README.md

Large diffs are not rendered by default.

@@ -53,7 +53,9 @@
f"Batch size {len(prompt)} is larger than allowed {max_batch_size}. If dynamic shape is used, then maximum batch size is 4"
)

pipeline_info = PipelineInfo(args.version)
min_image_size = 512
max_image_size = 1024 if args.version in ["2.0", "2.1"] else 768
pipeline_info = PipelineInfo(args.version, min_image_size=min_image_size, max_image_size=max_image_size)
pipeline = init_pipeline(Txt2ImgPipeline, pipeline_info, engine_type, args, max_batch_size, batch_size)

if engine_type == EngineType.TRT:
@@ -46,11 +46,18 @@ def load_pipelines(args, batch_size):
if batch_size > max_batch_size:
raise ValueError(f"Batch size {batch_size} is larger than allowed {max_batch_size}.")

# For TensorRT, the performance of an engine built with dynamic shapes is very sensitive to the range of image sizes.
# Here, we narrow the image size range for TensorRT to trade off some flexibility for performance.
min_image_size = 832 if args.engine != "ORT_CUDA" else 512
max_image_size = 1216 if args.engine != "ORT_CUDA" else 2048

# No VAE decoder is needed in the base pipeline when it outputs latents instead of images.
base_info = PipelineInfo(args.version, use_vae=False, min_image_size=640, max_image_size=1536)
base_info = PipelineInfo(args.version, use_vae=False, min_image_size=min_image_size, max_image_size=max_image_size)
base = init_pipeline(Txt2ImgXLPipeline, base_info, engine_type, args, max_batch_size, batch_size)

refiner_info = PipelineInfo(args.version, is_refiner=True, min_image_size=640, max_image_size=1536)
refiner_info = PipelineInfo(
args.version, is_refiner=True, min_image_size=min_image_size, max_image_size=max_image_size
)
refiner = init_pipeline(Img2ImgXLPipeline, refiner_info, engine_type, args, max_batch_size, batch_size)

if engine_type == EngineType.TRT:
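
As a rough, hypothetical sketch (not part of this change) of why the range matters: with dynamic shapes the TensorRT engine must cover every latent resolution between the configured minimum and maximum, so a profile for the UNet latent input could look roughly like the following. Latent dims are the image dims divided by 8; the demo's engine builder may use a different profile format and also account for classifier-free guidance batch doubling.

# Hypothetical dynamic-shape profile derived from the configured image-size range.
min_image_size, max_image_size = 832, 1216   # values used for TensorRT above
batch_size = 1
unet_latent_profile = {
    "sample": [
        (batch_size, 4, min_image_size // 8, min_image_size // 8),  # min shape
        (batch_size, 4, 1024 // 8, 1024 // 8),                      # opt shape
        (batch_size, 4, max_image_size // 8, max_image_size // 8),  # max shape
    ]
}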
@@ -139,52 +146,57 @@ def run_demo(args):


def run_dynamic_shape_demo(args):
"""Run demo of generating images with different size with list of prompts with ORT CUDA provider."""
"""Run demo of generating images with different settings with ORT CUDA provider."""
args.engine = "ORT_CUDA"
args.scheduler = "UniPC"
args.denoising_steps = 8
args.disable_cuda_graph = True
base, refiner = load_pipelines(args, 1)

batch_size = args.repeat_prompt
base, refiner = load_pipelines(args, batch_size)
prompts = [
"starry night over Golden Gate Bridge by van gogh",
"beautiful photograph of Mt. Fuji during cherry blossom",
"little cute gremlin sitting on a bed, cinematic",
"cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
"beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
"blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
]

image_sizes = [
(1024, 1024),
(1152, 896),
(896, 1152),
(1216, 832),
(832, 1216),
(1344, 768),
(768, 1344),
(1536, 640),
(640, 1536),
# batch size, height, width, scheduler, steps, prompt
configs = [
(1, 832, 1216, "UniPC", 8, prompts[0]),
(1, 1024, 1024, "DDIM", 24, prompts[1]),
(1, 1216, 832, "EulerA", 18, prompts[2]),
(2, 1344, 768, "DDIM", 30, prompts[3]),
(2, 640, 1536, "UniPC", 18, prompts[4]),
(2, 1152, 896, "EulerA", 30, prompts[5]),
]

# Warm up the pipelines. This only need once before serving.
# Warm up once before serving (to trigger cuDNN convolution algorithm search).
args.prompt = ["warm up"]
args.num_warmup_runs = 3
prompt, negative_prompt = repeat_prompt(args)
for height, width in image_sizes:
args.num_warmup_runs = 1
for batch_size, height, width, _, _, _ in configs:
args.batch_size = batch_size
args.height = height
args.width = width
print(f"\nWarm up pipelines for Batch_size={batch_size}, Height={height}, Width={width}")
print(f"\nWarm up batch_size={batch_size}, height={height}, width={width}")
prompt, negative_prompt = repeat_prompt(args)
run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=True)

# Run pipeline on a list of prompts.
prompts = [
"starry night over Golden Gate Bridge by van gogh",
"little cute gremlin sitting on a bed, cinematic",
]
args.num_warmup_runs = 0
for example_prompt in prompts:
for batch_size, height, width, scheduler, steps, example_prompt in configs:
args.prompt = [example_prompt]
args.batch_size = batch_size
args.height = height
args.width = width
args.scheduler = scheduler
args.denoising_steps = steps
base.set_scheduler(scheduler)
refiner.set_scheduler(scheduler)
print(
f"\nbatch_size={batch_size}, height={height}, width={width}, scheduler={scheduler}, steps={steps}, prompt={example_prompt}"
)
prompt, negative_prompt = repeat_prompt(args)

for height, width in image_sizes:
args.height = height
args.width = width
print(f"\nBatch_size={batch_size}, Height={height}, Width={width}, Prompt={example_prompt}")
run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False)
run_pipelines(args, base, refiner, prompt, negative_prompt, is_warm_up=False)

base.teardown()
refiner.teardown()
@@ -68,7 +68,7 @@ def parse_arguments(is_xl: bool, description: str):
"--scheduler",
type=str,
default="DDIM",
choices=["DDIM", "UniPC"] if is_xl else ["DDIM", "EulerA", "UniPC"],
choices=["DDIM", "EulerA", "UniPC"],
help="Scheduler for diffusion process",
)

@@ -84,7 +84,7 @@ def parse_arguments(is_xl: bool, description: str):
"--negative-prompt", nargs="*", default=[""], help="Optional negative prompt(s) to guide the image generation."
)
parser.add_argument(
"--repeat-prompt",
"--batch-size",
type=int,
default=1,
choices=[1, 2, 4, 8, 16],
@@ -194,7 +194,7 @@ def parse_arguments(is_xl: bool, description: str):
def repeat_prompt(args):
if not isinstance(args.prompt, list):
raise ValueError(f"`prompt` must be of type `str` or `str` list, but is {type(args.prompt)}")
prompt = args.prompt * args.repeat_prompt
prompt = args.prompt * args.batch_size

if not isinstance(args.negative_prompt, list):
raise ValueError(
@@ -255,6 +255,7 @@ def init_pipeline(pipeline_class, pipeline_info, engine_type, args, max_batch_si
static_image_shape=not args.build_dynamic_shape,
max_workspace_size=0,
device_id=torch.cuda.current_device(),
timing_cache=timing_cache,
)
elif engine_type == EngineType.TRT:
# Load TensorRT engines and pytorch modules
@@ -90,13 +90,15 @@ def __init__(
use_vae=False,
min_image_size=256,
max_image_size=1024,
use_fp16_vae=True,
):
self.version = version
self._is_inpaint = is_inpaint
self._is_refiner = is_refiner
self._use_vae = use_vae
self._min_image_size = min_image_size
self._max_image_size = max_image_size
self._use_fp16_vae = use_fp16_vae
if is_refiner:
assert self.is_xl()

@@ -127,6 +129,13 @@ def stages(self) -> List[str]:
def vae_scaling_factor(self) -> float:
return 0.13025 if self.is_xl() else 0.18215

def vae_torch_fallback(self) -> bool:
return self.is_xl() and not self._use_fp16_vae

def custom_fp16_vae(self) -> Optional[str]:
# For SD XL, use a VAE that is fine-tuned to run in fp16 precision without generating NaNs
return "madebyollin/sdxl-vae-fp16-fix" if self._use_fp16_vae and self.is_xl() else None

@staticmethod
def supported_versions(is_xl: bool):
return ["xl-1.0"] if is_xl else ["1.4", "1.5", "2.0-base", "2.0", "2.1", "2.1-base"]
@@ -44,7 +44,6 @@ def __init__(

alphas = 1.0 - betas
self.alphas_cumprod = torch.cumprod(alphas, dim=0)

# standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0

@@ -71,7 +70,7 @@ def configure(self):
self.variance = torch.from_numpy(variance).to(self.device)

timesteps = self.timesteps.long().cpu()
self.alphas_cumprod = self.alphas_cumprod[timesteps].to(self.device)
self.filtered_alphas_cumprod = self.alphas_cumprod[timesteps].to(self.device)
self.final_alpha_cumprod = self.final_alpha_cumprod.to(self.device)

def scale_model_input(self, sample: torch.FloatTensor, idx, *args, **kwargs) -> torch.FloatTensor:
@@ -124,9 +123,9 @@ def step(
# - pred_prev_sample -> "x_t-1"

prev_idx = idx + 1
alpha_prod_t = self.alphas_cumprod[idx]
alpha_prod_t = self.filtered_alphas_cumprod[idx]
alpha_prod_t_prev = (
self.alphas_cumprod[prev_idx] if prev_idx < self.num_inference_steps else self.final_alpha_cumprod
self.filtered_alphas_cumprod[prev_idx] if prev_idx < self.num_inference_steps else self.final_alpha_cumprod
)

beta_prod_t = 1 - alpha_prod_t
@@ -179,15 +178,15 @@ def step(
variance_noise = torch.randn(
model_output.shape, generator=generator, device=device, dtype=model_output.dtype
)
variance = variance ** (0.5) * eta * variance_noise
variance = std_dev_t * variance_noise

prev_sample = prev_sample + variance

return prev_sample

def add_noise(self, init_latents, noise, idx, latent_timestep):
sqrt_alpha_prod = self.alphas_cumprod[idx] ** 0.5
sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[idx]) ** 0.5
sqrt_alpha_prod = self.filtered_alphas_cumprod[idx] ** 0.5
sqrt_one_minus_alpha_prod = (1 - self.filtered_alphas_cumprod[idx]) ** 0.5
noisy_latents = sqrt_alpha_prod * init_latents + sqrt_one_minus_alpha_prod * noise

return noisy_latents
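
For reference, a minimal standalone sketch of the forward-noising identity that add_noise implements; the 0.5 value for the cumulative alpha is made up for illustration.

import torch

alpha_bar = torch.tensor(0.5)              # stands in for filtered_alphas_cumprod[idx]
init_latents = torch.randn(1, 4, 64, 64)   # clean latents x_0
noise = torch.randn_like(init_latents)     # Gaussian noise eps

# x_t = sqrt(alpha_bar) * x_0 + sqrt(1 - alpha_bar) * eps
noisy_latents = alpha_bar.sqrt() * init_latents + (1 - alpha_bar).sqrt() * noise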
@@ -60,15 +60,8 @@ def __init__(
self.torch_device = torch.device(device, torch.cuda.current_device())
self.stages = pipeline_info.stages()

# TODO: use custom fp16 for ORT_TRT, and no need to fallback to torch.
self.vae_torch_fallback = self.pipeline_info.is_xl() and engine_type != EngineType.ORT_CUDA

# For SD XL, use an VAE that modified to run in fp16 precision without generating NaNs.
self.custom_fp16_vae = (
"madebyollin/sdxl-vae-fp16-fix"
if self.pipeline_info.is_xl() and self.engine_type == EngineType.ORT_CUDA
else None
)
self.vae_torch_fallback = self.pipeline_info.vae_torch_fallback()
self.custom_fp16_vae = self.pipeline_info.custom_fp16_vae()

self.models = {}
self.engines = {}
@@ -13,21 +13,33 @@
from diffusion_models import PipelineInfo
from engine_builder import EngineBuilder, EngineType
from ort_utils import CudaSession
from packaging import version

import onnxruntime as ort

logger = logging.getLogger(__name__)


class OrtTensorrtEngine(CudaSession):
def __init__(self, engine_path, device_id, onnx_path, fp16, input_profile, workspace_size, enable_cuda_graph):
def __init__(
self,
engine_path,
device_id,
onnx_path,
fp16,
input_profile,
workspace_size,
enable_cuda_graph,
timing_cache_path=None,
):
self.engine_path = engine_path
self.ort_trt_provider_options = self.get_tensorrt_provider_options(
input_profile,
workspace_size,
fp16,
device_id,
enable_cuda_graph,
timing_cache_path=timing_cache_path,
)

session_options = ort.SessionOptions()
@@ -45,7 +57,9 @@ def __init__(self, engine_path, device_id, onnx_path, fp16, input_profile, works
device = torch.device("cuda", device_id)
super().__init__(ort_session, device, enable_cuda_graph)

def get_tensorrt_provider_options(self, input_profile, workspace_size, fp16, device_id, enable_cuda_graph):
def get_tensorrt_provider_options(
self, input_profile, workspace_size, fp16, device_id, enable_cuda_graph, timing_cache_path=None
):
trt_ep_options = {
"device_id": device_id,
"trt_fp16_enable": fp16,
@@ -55,6 +69,9 @@ def get_tensorrt_provider_options(self, input_profile, workspace_size, fp16, dev
"trt_engine_cache_path": self.engine_path,
}

if version.parse(ort.__version__) > version.parse("1.16.2") and timing_cache_path is not None:
trt_ep_options["trt_timing_cache_path"] = timing_cache_path

if enable_cuda_graph:
trt_ep_options["trt_cuda_graph_enable"] = True

@@ -153,6 +170,7 @@ def build_engines(
static_image_shape=True,
max_workspace_size=0,
device_id=0,
timing_cache=None,
):
self.torch_device = torch.device("cuda", device_id)
self.load_models(framework_model_dir)
@@ -224,7 +242,6 @@ def build_engines(

engine_path = self.get_engine_path(engine_dir, model_name, profile_id)
onnx_opt_path = self.get_onnx_path(model_name, onnx_dir, opt=True)

if not self.has_engine_file(engine_path):
logger.info(
"Building TensorRT engine for %s from %s to %s. It can take a while to complete...",
@@ -251,6 +268,7 @@
input_profile=input_profile,
workspace_size=self.get_work_space_size(model_name, max_workspace_size),
enable_cuda_graph=self.use_cuda_graph,
timing_cache_path=timing_cache,
)

built_engines[model_name] = engine
@@ -91,7 +91,10 @@ def optimize(
if keep_outputs:
m.prune_graph(outputs=keep_outputs)

use_external_data_format = m.model.ByteSize() >= onnx.checker.MAXIMUM_PROTOBUF
model_size = m.model.ByteSize()

# The model size can be negative (integer overflow?) on Windows.
use_external_data_format = model_size <= 0 or model_size >= onnx.checker.MAXIMUM_PROTOBUF

# Note that ORT < 1.16 could not save models larger than 2GB.
# This step is optional since it has no impact on inference latency.
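
A small sketch of the intended behavior using the public onnx API on a hypothetical model file; the optimizer's own save helper may differ.

import onnx

model = onnx.load("model.onnx")  # placeholder path
model_size = model.ByteSize()

# Fall back to external data when the proto is too large, or when ByteSize() overflows to a negative value.
use_external_data_format = model_size <= 0 or model_size >= onnx.checker.MAXIMUM_PROTOBUF
onnx.save_model(
    model,
    "model_optimized.onnx",
    save_as_external_data=use_external_data_format,
    all_tensors_to_one_file=True,
)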