diff --git a/.ruff.toml b/.ruff.toml index 89c979e5d..ea91ba5a5 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -15,7 +15,6 @@ exclude = [ "modules/xadapter", "modules/intel/openvino", "modules/intel/ipex", - "modules/dml", "modules/segmoe", "modules/control/proc", "modules/control/units", diff --git a/modules/dml/device.py b/modules/dml/device.py index b5e2c8a36..ae4d32a99 100644 --- a/modules/dml/device.py +++ b/modules/dml/device.py @@ -4,11 +4,14 @@ from .utils import rDevice, get_device class Device: + idx: int + def __enter__(self, device: Optional[rDevice]=None): torch.dml.context_device = get_device(device) + self.idx = torch.dml.context_device.index def __init__(self, device: Optional[rDevice]=None) -> torch.device: # pylint: disable=return-in-init - return get_device(device) + self.idx = get_device(device).index def __exit__(self, t, v, tb): torch.dml.context_device = None diff --git a/modules/dml/hijack/diffusers.py b/modules/dml/hijack/diffusers.py index 4896ccd3e..56b7d85cb 100644 --- a/modules/dml/hijack/diffusers.py +++ b/modules/dml/hijack/diffusers.py @@ -4,19 +4,8 @@ import diffusers import diffusers.utils.torch_utils +# copied from diffusers.PNDMScheduler._get_prev_sample def PNDMScheduler__get_prev_sample(self, sample: torch.FloatTensor, timestep, prev_timestep, model_output): - # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf - # this function computes x_(t−δ) using the formula of (9) - # Note that x_t needs to be added to both sides of the equation - - # Notation ( -> - # alpha_prod_t -> α_t - # alpha_prod_t_prev -> α_(t−δ) - # beta_prod_t -> (1 - α_t) - # beta_prod_t_prev -> (1 - α_(t−δ)) - # sample -> x_t - # model_output -> e_θ(x_t, t) - # prev_sample -> x_(t−δ) torch.dml.synchronize_tensor(sample) # DML synchronize alpha_prod_t = self.alphas_cumprod[timestep] alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod @@ -30,13 +19,8 @@ def PNDMScheduler__get_prev_sample(self, sample: torch.FloatTensor, timestep, pr f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`" ) - # corresponds to (α_(t−δ) - α_t) divided by - # denominator of x_t in formula (9) and plus 1 - # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) = - # sqrt(α_(t−δ)) / sqrt(α_t)) sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) - # corresponds to denominator of e_θ(x_t, t) in formula (9) model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + ( alpha_prod_t * beta_prod_t * alpha_prod_t_prev ) ** (0.5) @@ -52,31 +36,15 @@ def PNDMScheduler__get_prev_sample(self, sample: torch.FloatTensor, timestep, pr diffusers.PNDMScheduler._get_prev_sample = PNDMScheduler__get_prev_sample # pylint: disable=protected-access +# copied from diffusers.UniPCMultistepScheduler.multistep_uni_p_bh_update def UniPCMultistepScheduler_multistep_uni_p_bh_update( - self, + self: diffusers.UniPCMultistepScheduler, model_output: torch.FloatTensor, *args, sample: torch.FloatTensor = None, order: int = None, - **kwargs, + **_, ) -> torch.FloatTensor: - """ - One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. - - Args: - model_output (`torch.FloatTensor`): - The direct output from the learned diffusion model at the current timestep. - prev_timestep (`int`): - The previous discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): - A current instance of a sample created by the diffusion process. - order (`int`): - The order of UniP at this timestep (corresponds to the *p* in UniPC-p). - - Returns: - `torch.FloatTensor`: - The sample tensor at the previous timestep. - """ if sample is None: if len(args) > 1: sample = args[1] @@ -136,7 +104,7 @@ def UniPCMultistepScheduler_multistep_uni_p_bh_update( elif self.config.solver_type == "bh2": B_h = torch.expm1(hh) else: - raise NotImplementedError() + raise NotImplementedError for i in range(1, order + 1): R.append(torch.pow(rks, i - 1)) @@ -147,6 +115,7 @@ def UniPCMultistepScheduler_multistep_uni_p_bh_update( R = torch.stack(R) b = torch.tensor(b, device=device) + rhos_p = None if len(D1s) > 0: D1s = torch.stack(D1s, dim=1) # (B, K) # for order 2, we use a simplified version @@ -179,34 +148,15 @@ def UniPCMultistepScheduler_multistep_uni_p_bh_update( diffusers.UniPCMultistepScheduler.multistep_uni_p_bh_update = UniPCMultistepScheduler_multistep_uni_p_bh_update +# copied from diffusers.LCMScheduler.step def LCMScheduler_step( - self, + self: diffusers.LCMScheduler, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, ) -> Union[diffusers.schedulers.scheduling_lcm.LCMSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.FloatTensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.FloatTensor`): - A current instance of a sample created by the diffusion process. - generator (`torch.Generator`, *optional*): - A random number generator. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`. - Returns: - [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a - tuple is returned where the first element is the sample tensor. - """ if self.num_inference_steps is None: raise ValueError( "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" diff --git a/modules/dml/hijack/stablediffusion.py b/modules/dml/hijack/stablediffusion.py index 818250181..fc2518aa7 100644 --- a/modules/dml/hijack/stablediffusion.py +++ b/modules/dml/hijack/stablediffusion.py @@ -68,7 +68,7 @@ def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=F pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) if dynamic_threshold is not None: - raise NotImplementedError() + raise NotImplementedError # direction pointing to x_t dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t diff --git a/modules/zluda.py b/modules/zluda.py index 9737d2493..71f870267 100644 --- a/modules/zluda.py +++ b/modules/zluda.py @@ -3,6 +3,7 @@ import sys from typing import Union import torch from torch._prims_common import DeviceLikeType +import onnxruntime as ort from modules import shared, devices @@ -57,8 +58,14 @@ def initialize_zluda(): if hasattr(torch.backends.cuda, "enable_cudnn_sdp"): torch.backends.cuda.enable_cudnn_sdp(False) torch.backends.cuda.enable_cudnn_sdp = do_nothing - shared.opts.sdp_options = ['Math attention'] + + # ONNX Runtime is not supported + ort.capi._pybind_state.get_available_providers = lambda: [v for v in ort.get_available_providers() if v != 'CUDAExecutionProvider'] # pylint: disable=protected-access + ort.get_available_providers = ort.capi._pybind_state.get_available_providers # pylint: disable=protected-access + if shared.opts.onnx_execution_provider == 'CUDAExecutionProvider': + shared.opts.onnx_execution_provider = 'CPUExecutionProvider' + devices.device_codeformer = devices.cpu result = test(device)