|
| 1 | +# |
| 2 | +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | +# |
| 17 | + |
| 18 | +import argparse |
| 19 | + |
| 20 | +from PIL import Image |
| 21 | + |
| 22 | +from stable_video_diffusion_pipeline import StableVideoDiffusionPipeline |
| 23 | +from utilities import ( |
| 24 | + PIPELINE_TYPE, |
| 25 | + add_arguments, |
| 26 | + download_image, |
| 27 | +) |
| 28 | + |
def parseArgs():
    """Build the command-line parser for the img2vid demo and parse the arguments.

    The parser is first handed to `add_arguments`, then the demo-specific
    options below are registered. Because the parser is created with
    `conflict_handler='resolve'`, an option registered here replaces any
    option with the same flag that was registered earlier.

    Returns:
        The `argparse.Namespace` produced by `parser.parse_args()`.
    """
    cli = argparse.ArgumentParser(
        description="Options for Stable Diffusion Img2Vid Demo",
        conflict_handler='resolve',
    )
    cli = add_arguments(cli)

    # (flag, add_argument keyword arguments), registered in this order.
    demo_options = (
        ('--version', dict(type=str, default="svd-xt-1.1", choices=["svd-xt-1.1"], help="Version of Stable Video Diffusion")),
        ('--input-image', dict(type=str, default="", help="Path to the input image")),
        ('--height', dict(type=int, default=576, help="Height of image to generate (must be multiple of 8)")),
        ('--width', dict(type=int, default=1024, help="Width of image to generate (must be multiple of 8)")),
        ('--min-guidance-scale', dict(type=float, default=1.0, help="The minimum guidance scale. Used for the classifier free guidance with first frame")),
        ('--max-guidance-scale', dict(type=float, default=3.0, help="The maximum guidance scale. Used for the classifier free guidance with last frame")),
        ('--denoising-steps', dict(type=int, default=25, help="Number of denoising steps")),
        ('--num-warmup-runs', dict(type=int, default=1, help="Number of warmup runs before benchmarking performance")),
    )
    for flag, spec in demo_options:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
| 41 | + |
def process_pipeline_args(args):
    """Validate the parsed options and split them into the demo's call arguments.

    Args:
        args: Parsed command-line namespace. It is modified in place:
            `build_static_batch` is forced to True, and `input_image` is set
            to a default URL when it is empty.

    Returns:
        A tuple `(kwargs_init_pipeline, kwargs_load_engine, args_run_demo)`:
        keyword arguments for constructing the pipeline, keyword arguments
        for loading the engines, and the positional arguments for running
        the demo (the first of which is the resized input image).

    Raises:
        ValueError: If height or width is not divisible by 8, if the batch
            size exceeds the supported maximum, if dynamic shapes are
            requested, or if `args.input_image` is neither a `str` nor a
            `PIL.Image.Image`.
    """
    # Run the cheap option checks first so that invalid settings are reported
    # before any image is fetched.
    if args.height % 8 != 0 or args.width % 8 != 0:
        raise ValueError(f"Image height and width have to be divisible by 8 but are: {args.height} and {args.width}.")

    # TODO enable BS>1
    max_batch_size = 1
    args.build_static_batch = True

    if args.batch_size > max_batch_size:
        raise ValueError(f"Batch size {args.batch_size} is larger than allowed {max_batch_size}.")

    if not args.build_static_batch or args.build_dynamic_shape:
        raise ValueError("Dynamic shapes not supported. Do not specify `--build-dynamic-shape`")

    # Fall back to a sample image when none was given.
    if not args.input_image:
        args.input_image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png?download=true"

    target_size = (args.width, args.height)
    if isinstance(args.input_image, str):
        # Strings are handed to `download_image`, i.e. they are treated as URLs.
        input_image = download_image(args.input_image).resize(target_size)
    elif isinstance(args.input_image, Image.Image):
        # Already a decoded image: use it directly (it cannot be passed to
        # `Image.open`) and resize it the same way as a downloaded one.
        input_image = args.input_image.resize(target_size)
    else:
        raise ValueError(f"Input image(s) must be of type `PIL.Image.Image` or `str` (URL) but is {type(args.input_image)}")

    kwargs_init_pipeline = {
        'version': args.version,
        'max_batch_size': max_batch_size,
        'denoising_steps': args.denoising_steps,
        'scheduler': args.scheduler,
        'min_guidance_scale': args.min_guidance_scale,
        'max_guidance_scale': args.max_guidance_scale,
        'output_dir': args.output_dir,
        'hf_token': args.hf_token,
        'verbose': args.verbose,
        'nvtx_profile': args.nvtx_profile,
        'use_cuda_graph': args.use_cuda_graph,
        'framework_model_dir': args.framework_model_dir,
        'torch_inference': args.torch_inference,
    }

    kwargs_load_engine = {
        'onnx_opset': args.onnx_opset,
        'opt_batch_size': args.batch_size,
        'opt_image_height': args.height,
        'opt_image_width': args.width,
        'static_batch': args.build_static_batch,
        'static_shape': not args.build_dynamic_shape,
        'enable_all_tactics': args.build_all_tactics,
        'enable_refit': args.build_enable_refit,
        'timing_cache': args.timing_cache,
    }

    args_run_demo = (input_image, args.height, args.width, args.batch_size, args.batch_count, args.num_warmup_runs, args.use_cuda_graph)

    return kwargs_init_pipeline, kwargs_load_engine, args_run_demo
| 97 | + |
if __name__ == "__main__":
    print("[I] Initializing StableDiffusion img2vid demo using TensorRT")
    cli_args = parseArgs()
    init_kwargs, engine_kwargs, run_args = process_pipeline_args(cli_args)

    # Construct the img2vid pipeline from the validated options.
    pipeline = StableVideoDiffusionPipeline(pipeline_type=PIPELINE_TYPE.IMG2VID, **init_kwargs)

    # Load the engines, then the resources for the requested size, batch and seed.
    pipeline.loadEngines(cli_args.engine_dir, cli_args.framework_model_dir, cli_args.onnx_dir, **engine_kwargs)
    pipeline.loadResources(cli_args.height, cli_args.width, cli_args.batch_size, cli_args.seed)

    # Run inference, then tear the pipeline down.
    pipeline.run(*run_args)
    pipeline.teardown()
0 commit comments