diff --git a/.ruff.toml b/.ruff.toml
index 89c979e5d..ea91ba5a5 100644
--- a/.ruff.toml
+++ b/.ruff.toml
@@ -15,7 +15,6 @@ exclude = [
     "modules/xadapter",
     "modules/intel/openvino",
     "modules/intel/ipex",
-    "modules/dml",
     "modules/segmoe",
     "modules/control/proc",
     "modules/control/units",
diff --git a/modules/dml/device.py b/modules/dml/device.py
index b5e2c8a36..ae4d32a99 100644
--- a/modules/dml/device.py
+++ b/modules/dml/device.py
@@ -4,11 +4,14 @@ from .utils import rDevice, get_device
 
 
 class Device:
+    idx: int
+
     def __enter__(self, device: Optional[rDevice]=None):
         torch.dml.context_device = get_device(device)
+        self.idx = torch.dml.context_device.index
 
     def __init__(self, device: Optional[rDevice]=None) -> torch.device: # pylint: disable=return-in-init
-        return get_device(device)
+        self.idx = get_device(device).index
 
     def __exit__(self, t, v, tb):
         torch.dml.context_device = None
diff --git a/modules/dml/hijack/diffusers.py b/modules/dml/hijack/diffusers.py
index 4896ccd3e..56b7d85cb 100644
--- a/modules/dml/hijack/diffusers.py
+++ b/modules/dml/hijack/diffusers.py
@@ -4,19 +4,8 @@ import diffusers
 import diffusers.utils.torch_utils
 
 
+# copied from diffusers.PNDMScheduler._get_prev_sample
 def PNDMScheduler__get_prev_sample(self, sample: torch.FloatTensor, timestep, prev_timestep, model_output):
-    # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
-    # this function computes x_(t−δ) using the formula of (9)
-    # Note that x_t needs to be added to both sides of the equation
-
-    # Notation (<variable name> -> <name in paper>
-    # alpha_prod_t -> α_t
-    # alpha_prod_t_prev -> α_(t−δ)
-    # beta_prod_t -> (1 - α_t)
-    # beta_prod_t_prev -> (1 - α_(t−δ))
-    # sample -> x_t
-    # model_output -> e_θ(x_t, t)
-    # prev_sample -> x_(t−δ)
     torch.dml.synchronize_tensor(sample) # DML synchronize
     alpha_prod_t = self.alphas_cumprod[timestep]
     alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
@@ -30,13 +19,8 @@ def PNDMScheduler__get_prev_sample(self, sample: torch.FloatTensor, timestep, pr
             f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
         )
 
-    # corresponds to (α_(t−δ) - α_t) divided by
-    # denominator of x_t in formula (9) and plus 1
-    # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) =
-    # sqrt(α_(t−δ)) / sqrt(α_t))
     sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)
 
-    # corresponds to denominator of e_θ(x_t, t) in formula (9)
     model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
         alpha_prod_t * beta_prod_t * alpha_prod_t_prev
     ) ** (0.5)
@@ -52,31 +36,15 @@ def PNDMScheduler__get_prev_sample(self, sample: torch.FloatTensor, timestep, pr
 diffusers.PNDMScheduler._get_prev_sample = PNDMScheduler__get_prev_sample # pylint: disable=protected-access
 
 
+# copied from diffusers.UniPCMultistepScheduler.multistep_uni_p_bh_update
 def UniPCMultistepScheduler_multistep_uni_p_bh_update(
-    self,
+    self: diffusers.UniPCMultistepScheduler,
     model_output: torch.FloatTensor,
     *args,
     sample: torch.FloatTensor = None,
     order: int = None,
-    **kwargs,
+    **_,
 ) -> torch.FloatTensor:
-    """
-    One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified.
-
-    Args:
-        model_output (`torch.FloatTensor`):
-            The direct output from the learned diffusion model at the current timestep.
-        prev_timestep (`int`):
-            The previous discrete timestep in the diffusion chain.
-        sample (`torch.FloatTensor`):
-            A current instance of a sample created by the diffusion process.
-        order (`int`):
-            The order of UniP at this timestep (corresponds to the *p* in UniPC-p).
-
-    Returns:
-        `torch.FloatTensor`:
-            The sample tensor at the previous timestep.
-    """
     if sample is None:
         if len(args) > 1:
             sample = args[1]
@@ -136,7 +104,7 @@ def UniPCMultistepScheduler_multistep_uni_p_bh_update(
     elif self.config.solver_type == "bh2":
         B_h = torch.expm1(hh)
     else:
-        raise NotImplementedError()
+        raise NotImplementedError
 
     for i in range(1, order + 1):
         R.append(torch.pow(rks, i - 1))
@@ -147,6 +115,7 @@ def UniPCMultistepScheduler_multistep_uni_p_bh_update(
     R = torch.stack(R)
     b = torch.tensor(b, device=device)
 
+    rhos_p = None
     if len(D1s) > 0:
         D1s = torch.stack(D1s, dim=1)  # (B, K)
         # for order 2, we use a simplified version
@@ -179,34 +148,15 @@ def UniPCMultistepScheduler_multistep_uni_p_bh_update(
 diffusers.UniPCMultistepScheduler.multistep_uni_p_bh_update = UniPCMultistepScheduler_multistep_uni_p_bh_update
 
 
+# copied from diffusers.LCMScheduler.step
 def LCMScheduler_step(
-        self,
+        self: diffusers.LCMScheduler,
         model_output: torch.FloatTensor,
         timestep: int,
         sample: torch.FloatTensor,
         generator: Optional[torch.Generator] = None,
         return_dict: bool = True,
     ) -> Union[diffusers.schedulers.scheduling_lcm.LCMSchedulerOutput, Tuple]:
-    """
-    Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
-    process from the learned model outputs (most often the predicted noise).
-
-    Args:
-        model_output (`torch.FloatTensor`):
-            The direct output from learned diffusion model.
-        timestep (`float`):
-            The current discrete timestep in the diffusion chain.
-        sample (`torch.FloatTensor`):
-            A current instance of a sample created by the diffusion process.
-        generator (`torch.Generator`, *optional*):
-            A random number generator.
-        return_dict (`bool`, *optional*, defaults to `True`):
-            Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.
-    Returns:
-        [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
-            If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
-            tuple is returned where the first element is the sample tensor.
-    """
     if self.num_inference_steps is None:
         raise ValueError(
             "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
diff --git a/modules/dml/hijack/stablediffusion.py b/modules/dml/hijack/stablediffusion.py
index 818250181..fc2518aa7 100644
--- a/modules/dml/hijack/stablediffusion.py
+++ b/modules/dml/hijack/stablediffusion.py
@@ -68,7 +68,7 @@ def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=F
         pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
 
     if dynamic_threshold is not None:
-        raise NotImplementedError()
+        raise NotImplementedError
 
     # direction pointing to x_t
     dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
diff --git a/modules/zluda.py b/modules/zluda.py
index 9737d2493..71f870267 100644
--- a/modules/zluda.py
+++ b/modules/zluda.py
@@ -3,6 +3,7 @@ import sys
 from typing import Union
 import torch
 from torch._prims_common import DeviceLikeType
+import onnxruntime as ort
 from modules import shared, devices
 
 
@@ -57,8 +58,14 @@ def initialize_zluda():
         if hasattr(torch.backends.cuda, "enable_cudnn_sdp"):
             torch.backends.cuda.enable_cudnn_sdp(False)
             torch.backends.cuda.enable_cudnn_sdp = do_nothing
-
         shared.opts.sdp_options = ['Math attention']
+
+        # ONNX Runtime is not supported
+        ort.capi._pybind_state.get_available_providers = lambda _orig=ort.get_available_providers: [v for v in _orig() if v != 'CUDAExecutionProvider'] # original getter bound as default: the next line rebinds ort.get_available_providers to this lambda, a call-time lookup would recurse forever # pylint: disable=protected-access
+        ort.get_available_providers = ort.capi._pybind_state.get_available_providers # pylint: disable=protected-access
+        if shared.opts.onnx_execution_provider == 'CUDAExecutionProvider':
+            shared.opts.onnx_execution_provider = 'CPUExecutionProvider'
+
         devices.device_codeformer = devices.cpu
 
         result = test(device)