follow up

2023-12-16 14:37:04 +09:00 · 2023-12-16 14:37:04 +09:00 · 86b56d2d2c
parent 106a1ea68f
commit 86b56d2d2c
21 changed files with 259 additions and 30 deletions
--- a/configs/olive/sd_text_encoder.json
+++ b/configs/olive/sd_text_encoder.json
@@ -38,7 +38,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -73,8 +73,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "clip",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "clip",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sd_unet.json
+++ b/configs/olive/sd_unet.json
@@ -55,7 +55,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -90,8 +90,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "unet",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "unet",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sd_vae_decoder.json
+++ b/configs/olive/sd_vae_decoder.json
@@ -45,7 +45,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -80,8 +80,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sd_vae_encoder.json
+++ b/configs/olive/sd_vae_encoder.json
@@ -45,7 +45,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -80,8 +80,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": false
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sdxl_text_encoder.json
+++ b/configs/olive/sdxl_text_encoder.json
@@ -71,7 +71,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -106,8 +106,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "clip",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "clip",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sdxl_text_encoder_2.json
+++ b/configs/olive/sdxl_text_encoder_2.json
@@ -111,7 +111,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -146,8 +146,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "clip",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "clip",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sdxl_unet.json
+++ b/configs/olive/sdxl_unet.json
@@ -61,7 +61,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -96,8 +96,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "unet",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "unet",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sdxl_vae_decoder.json
+++ b/configs/olive/sdxl_vae_decoder.json
@@ -51,7 +51,7 @@
    }
  },
  "passes": {
-    "optimize": {
+    "optimize_DmlExecutionProvider": {
      "type": "OrtTransformersOptimization",
      "disable_search": true,
      "config": {
@@ -108,8 +108,29 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": false,
+        "use_gpu": true
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": false,
+        "use_gpu": true
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/olive/sdxl_vae_encoder.json
+++ b/configs/olive/sdxl_vae_encoder.json
@@ -86,8 +86,31 @@
          "GroupNorm": [0, 1, 2]
        }
      }
+    },
+    "optimize_CUDAExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
+    },
+    "optimize_ROCMExecutionProvider": {
+      "type": "OrtTransformersOptimization",
+      "disable_search": true,
+      "config": {
+        "model_type": "vae",
+        "opt_level": 0,
+        "float16": true,
+        "use_gpu": true,
+        "keep_io_types": true
+      }
    }
  },
+  "pass_flows": [[]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sd_text_encoder.json
+++ b/configs/onnx/sd_text_encoder.json
@@ -45,6 +45,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sd_unet.json
+++ b/configs/onnx/sd_unet.json
@@ -65,6 +65,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sd_vae_decoder.json
+++ b/configs/onnx/sd_vae_decoder.json
@@ -52,6 +52,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sd_vae_encoder.json
+++ b/configs/onnx/sd_vae_encoder.json
@@ -52,6 +52,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sdxl_text_encoder.json
+++ b/configs/onnx/sdxl_text_encoder.json
@@ -78,6 +78,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sdxl_text_encoder_2.json
+++ b/configs/onnx/sdxl_text_encoder_2.json
@@ -118,6 +118,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sdxl_unet.json
+++ b/configs/onnx/sdxl_unet.json
@@ -71,6 +71,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sdxl_vae_decoder.json
+++ b/configs/onnx/sdxl_vae_decoder.json
@@ -58,6 +58,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/configs/onnx/sdxl_vae_encoder.json
+++ b/configs/onnx/sdxl_vae_encoder.json
@@ -58,6 +58,7 @@
      }
    }
  },
+  "pass_flows": [["convert"]],
  "engine": {
    "search_strategy": {
      "execution_order": "joint",
--- a/modules/olive.py
+++ b/modules/olive.py
@@ -9,7 +9,7 @@ is_sdxl = False
 width = 512
 height = 512
 batch_size = 1
-hidden_state_size = 768
+cross_attention_dim = 768
 time_ids_size = 5


@@ -87,14 +87,11 @@ def text_encoder_2_data_loader(data_dir, _, *args, **kwargs):


 def unet_inputs(_, torch_dtype, is_conversion_inputs=False):
-    # TODO (pavignol): All the multiplications by 2 here are bacause the XL base has 2 text encoders
-    # For refiner, it should be multiplied by 1 (single text encoder)
-
    if is_sdxl:
        inputs = {
            "sample": torch.rand((2 * batch_size, 4, height // 8, width // 8), dtype=torch_dtype),
            "timestep": torch.rand((1,), dtype=torch_dtype),
-            "encoder_hidden_states": torch.rand((2 * batch_size, 77, hidden_state_size), dtype=torch_dtype),
+            "encoder_hidden_states": torch.rand((2 * batch_size, 77, cross_attention_dim), dtype=torch_dtype),
        }

        if is_conversion_inputs:
@@ -111,20 +108,25 @@ def unet_inputs(_, torch_dtype, is_conversion_inputs=False):
        inputs = {
            "sample": torch.rand((batch_size, 4, height // 8, width // 8), dtype=torch_dtype),
            "timestep": torch.rand((batch_size,), dtype=torch_dtype),
-            "encoder_hidden_states": torch.rand((batch_size, 77, hidden_state_size), dtype=torch_dtype),
-            "return_dict": False,
+            "encoder_hidden_states": torch.rand((batch_size, 77, cross_attention_dim), dtype=torch_dtype),
        }

+        # use as kwargs since they won't be in the correct position if passed along with the tuple of inputs
+        kwargs = {
+            "return_dict": False,
+        }
        if is_conversion_inputs:
            inputs["additional_inputs"] = {
+                **kwargs,
                "added_cond_kwargs": {
                    "text_embeds": torch.rand((1, 1280), dtype=torch_dtype),
-                    "time_ids": torch.rand((1, time_ids_size), dtype=torch_dtype),
-                }
+                    "time_ids": torch.rand((1, 5), dtype=torch_dtype),
+                },
            }
        else:
+            inputs.update(kwargs)
            inputs["onnx::Concat_4"] = torch.rand((1, 1280), dtype=torch_dtype)
-            inputs["onnx::Shape_5"] = torch.rand((1, time_ids_size), dtype=torch_dtype)
+            inputs["onnx::Shape_5"] = torch.rand((1, 5), dtype=torch_dtype)

    return inputs

--- a/modules/onnx.py
+++ b/modules/onnx.py
@@ -4,6 +4,7 @@ import torch
 import shutil
 import inspect
 import importlib
+from packaging import version
 import numpy as np
 import onnxruntime as ort
 import diffusers
@@ -109,6 +110,9 @@ def load_init_dict(cls: Type[diffusers.DiffusionPipeline], path: os.PathLike):
    R: Dict[str, Tuple[str]] = {}
    for k, v in merged:
        if isinstance(v, list):
+            if v[0] is None or v[1] is None:
+                log.debug(f"Skipping {k} while loading init dict of '{path}': {v}")
+                continue
            R[k] = v
    return R

@@ -142,9 +146,17 @@ def load_submodels(path: os.PathLike, init_dict: Dict[str, Type], **kwargs):
    return loaded


+def patch_kwargs(cls: Type[diffusers.DiffusionPipeline], kwargs: Dict) -> Dict:
+    if cls == OnnxStableDiffusionPipeline or cls == OnnxStableDiffusionImg2ImgPipeline or cls == OnnxStableDiffusionInpaintPipeline:
+        kwargs["safety_checker"] = None
+        kwargs["requires_safety_checker"] = False
+
+    return kwargs
+
+
 def load_pipeline(cls: Type[diffusers.DiffusionPipeline], path: os.PathLike):
    if os.path.isdir(path):
-        return cls(**load_submodels(path, load_init_dict(cls, path)))
+        return cls(**patch_kwargs(cls, load_submodels(path, load_init_dict(cls, path))))
    else:
        return cls.from_single_file(path)

@@ -284,8 +296,7 @@ class OnnxRawPipeline(OnnxPipelineBase):
                if submodel in init_dict:
                    del init_dict[submodel] # already loaded as OnnxRuntimeModel.
            kwargs.update(load_submodels(in_dir, init_dict)) # load others.
-            kwargs["safety_checker"] = None
-            kwargs["requires_safety_checker"] = False
+            kwargs = patch_kwargs(self.constructor, kwargs)

            pipeline = self.constructor(**kwargs)
            pipeline.to_json_file(os.path.join(out_dir, "model_index.json"))
@@ -353,11 +364,13 @@ class OnnxRawPipeline(OnnxPipelineBase):

                with open(os.path.join(sd_configs_path, "olive", f"{'sdxl' if self._is_sdxl else 'sd'}_{submodel}.json"), "r") as config_file:
                    olive_config = json.load(config_file)
+                pass_key = f"optimize_{shared.opts.onnx_execution_provider}"
+                olive_config["pass_flows"] = [[pass_key]]
                olive_config["input_model"]["config"]["model_path"] = os.path.abspath(os.path.join(in_dir, submodel, "model.onnx"))
-                olive_config["passes"]["optimize"]["config"]["float16"] = shared.opts.onnx_olive_float16
-                if (submodel == "unet" or "vae" in submodel) and (shared.opts.onnx_execution_provider == ExecutionProvider.CUDA or shared.opts.onnx_execution_provider == ExecutionProvider.ROCm):
-                    olive_config["passes"]["optimize"]["config"]["optimization_options"]["group_norm_channels_last"] = True
+                olive_config["passes"][pass_key]["config"]["float16"] = shared.opts.onnx_olive_float16
                olive_config["engine"]["execution_providers"] = [shared.opts.onnx_execution_provider]
+                if (shared.opts.onnx_execution_provider == ExecutionProvider.CUDA or shared.opts.onnx_execution_provider == ExecutionProvider.ROCm) and version.parse(ort.__version__) < version.parse("1.17.0"):
+                    olive_config["passes"][pass_key]["config"]["optimization_options"] = {"enable_skip_group_norm": False}

                run(olive_config)

@@ -388,8 +401,7 @@ class OnnxRawPipeline(OnnxPipelineBase):
                if submodel in init_dict:
                    del init_dict[submodel] # already loaded as OnnxRuntimeModel.
            kwargs.update(load_submodels(in_dir, init_dict)) # load others.
-            kwargs["safety_checker"] = None
-            kwargs["requires_safety_checker"] = False
+            kwargs = patch_kwargs(self.constructor, kwargs)

            pipeline = self.constructor(**kwargs)
            pipeline.to_json_file(os.path.join(out_dir, "model_index.json"))
@@ -416,16 +428,18 @@ class OnnxRawPipeline(OnnxPipelineBase):
            return None

    def preprocess(self, width: int, height: int, batch_size: int):
+        if not shared.cmd_opts.debug:
+            ort.set_default_logger_severity(3)
        olive.width = width
        olive.height = height
        olive.batch_size = batch_size

        olive.is_sdxl = self._is_sdxl
        if olive.is_sdxl:
-            olive.hidden_state_size = 2048
+            olive.cross_attention_dim = 2048
            olive.time_ids_size = 6
        else:
-            olive.hidden_state_size = height + 256
+            olive.cross_attention_dim = height + 256
            olive.time_ids_size = 5

        converted_dir = self.convert(self.path if os.path.isdir(self.path) else shared.opts.onnx_temp_dir)
--- a/modules/processing_diffusers.py
+++ b/modules/processing_diffusers.py
@@ -7,7 +7,6 @@ import torch
 import torchvision.transforms.functional as TF
 import diffusers
 from modules import shared, devices, processing, sd_samplers, sd_models, images, errors, masking, prompt_parser_diffusers, sd_hijack_hypertile, processing_correction, processing_vae
-from modules.olive import OlivePipeline


 debug = shared.log.trace if os.environ.get('SD_DIFFUSERS_DEBUG', None) is not None else lambda *args, **kwargs: None
@@ -223,7 +222,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
            generator = [torch.Generator(generator_device).manual_seed(s) for s in p.seeds]
        prompts, negative_prompts, prompts_2, negative_prompts_2 = fix_prompts(prompts, negative_prompts, prompts_2, negative_prompts_2)
        parser = 'Fixed attention'
-        if shared.opts.prompt_attention != 'Fixed attention' and 'StableDiffusion' in model.__class__.__name__ and not isinstance(model, diffusers.OnnxStableDiffusionPipeline):
+        if shared.opts.prompt_attention != 'Fixed attention' and 'StableDiffusion' in model.__class__.__name__ and not isinstance(model, OnnxStableDiffusionPipeline):
            try:
                prompt_parser_diffusers.encode_prompts(model, p, prompts, negative_prompts, kwargs.get("num_inference_steps", 1), kwargs.pop("clip_skip", None))
                parser = shared.opts.prompt_attention