ROCm and ZLUDA: don't fall back to CPU, and clean up strings

pull/4231/head
Disty0 2025-09-27 11:32:46 +03:00
parent 579b1f3175
commit 71fde8a897
6 changed files with 26 additions and 45 deletions

View File

@ -677,18 +677,9 @@ def install_rocm_zluda():
if args.skip_all or args.skip_requirements:
return torch_command
from modules import rocm
if rocm.err is not None:
log.warning(f'ROCm: error checking ROCm toolkit: {rocm.err}')
log.info('Using CPU-only torch')
return os.environ.get('TORCH_COMMAND', 'torch torchvision')
if not rocm.is_installed:
log.warning('ROCm: could not find ROCm toolkit installed')
log.info('Using CPU-only torch')
return os.environ.get('TORCH_COMMAND', 'torch torchvision')
log.info('ROCm: AMD toolkit detected')
# if not is_windows:
# os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow-rocm')
#os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow')
device = None
try:
@ -701,8 +692,6 @@ def install_rocm_zluda():
index = 0
for idx, gpu in enumerate(amd_gpus):
index = idx
# if gpu.name.startswith('gfx11') and os.environ.get('TENSORFLOW_PACKAGE') == 'tensorflow-rocm': # do not use tensorflow-rocm for navi 3x
# os.environ['TENSORFLOW_PACKAGE'] = 'tensorflow==2.13.0'
if not gpu.is_apu:
# although apu was found, there can be a dedicated card. do not break loop.
# if no dedicated card was found, apu will be used.
@ -722,22 +711,23 @@ def install_rocm_zluda():
log.info(msg)
if sys.platform == "win32":
#check_python(supported_minors=[10, 11, 12, 13], reason='ZLUDA backend requires a Python version between 3.10 and 3.13')
if args.use_rocm: # TODO install: switch to pytorch source when it becomes available
if isinstance(rocm.environment, rocm.PythonPackageEnvironment): # TheRock
if device is not None and isinstance(rocm.environment, rocm.PythonPackageEnvironment): # TheRock
check_python(supported_minors=[11, 12, 13], reason='ROCm backend requires a Python version between 3.11 and 3.13')
torch_command = os.environ.get('TORCH_COMMAND', f'torch torchvision --index-url https://rocm.nightlies.amd.com/v2-staging/{rocm.get_distribution(device)}')
else:
check_python(supported_minors=[12], reason='AMD Windows preview requires a Python version 3.12')
check_python(supported_minors=[12], reason='ROCm Windows preview requires Python version 3.12')
torch_command = os.environ.get('TORCH_COMMAND', '--no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-6.4.4/torch-2.8.0a0%2Bgitfc14c65-cp312-cp312-win_amd64.whl https://repo.radeon.com/rocm/windows/rocm-rel-6.4.4/torchvision-0.24.0a0%2Bc85f008-cp312-cp312-win_amd64.whl')
else:
#check_python(supported_minors=[10, 11, 12, 13], reason='ZLUDA backend requires a Python version between 3.10 and 3.13')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.7.1+cu118 torchvision==0.22.1+cu118 --index-url https://download.pytorch.org/whl/cu118')
if args.device_id is not None:
if os.environ.get('HIP_VISIBLE_DEVICES', None) is not None:
log.warning('Setting HIP_VISIBLE_DEVICES and --device-id at the same time may be mistake.')
os.environ['HIP_VISIBLE_DEVICES'] = args.device_id
del args.device_id
error = None
from modules import zluda_installer
try:
if args.reinstall or zluda_installer.is_reinstall_needed():
@ -745,19 +735,12 @@ def install_rocm_zluda():
zluda_installer.install()
zluda_installer.set_default_agent(device)
except Exception as e:
error = e
log.warning(f'Failed to install ZLUDA: {e}')
if error is None:
try:
zluda_installer.load()
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.7.1+cu118 torchvision==0.22.1+cu118 --index-url https://download.pytorch.org/whl/cu118')
except Exception as e:
error = e
log.warning(f'Failed to load ZLUDA: {e}')
if error is not None:
log.info('Using CPU-only torch')
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
try:
zluda_installer.load()
except Exception as e:
log.warning(f'Failed to load ZLUDA: {e}')
else:
#check_python(supported_minors=[10, 11, 12, 13], reason='ROCm backend requires a Python version between 3.10 and 3.13')
@ -793,7 +776,7 @@ def install_rocm_zluda():
log.info(f'ROCm: HSA_OVERRIDE_GFX_VERSION auto config skipped: device={device.name if device is not None else None} version={os.environ.get("HSA_OVERRIDE_GFX_VERSION", None)}')
else:
gfx_ver = device.get_gfx_version()
if gfx_ver is not None:
if gfx_ver is not None and device.name.removeprefix("gfx") != gfx_ver.replace(".", ""):
os.environ.setdefault('HSA_OVERRIDE_GFX_VERSION', gfx_ver)
log.info(f'ROCm: HSA_OVERRIDE_GFX_VERSION config overridden: device={device.name} version={os.environ.get("HSA_OVERRIDE_GFX_VERSION", None)}')
@ -936,8 +919,8 @@ def check_torch():
if torch_command != '':
pass
else:
is_cuda_available = allow_cuda and (shutil.which('nvidia-smi') is not None or args.use_xformers or os.path.exists(os.path.join(os.environ.get('SystemRoot') or r'C:\Windows', 'System32', 'nvidia-smi.exe')))
is_rocm_available = allow_rocm and rocm.is_installed
is_cuda_available = allow_cuda and (args.use_cuda or shutil.which('nvidia-smi') is not None or args.use_xformers or os.path.exists(os.path.join(os.environ.get('SystemRoot') or r'C:\Windows', 'System32', 'nvidia-smi.exe')))
is_rocm_available = allow_rocm and (args.use_rocm or args.use_zluda or rocm.is_installed)
is_ipex_available = allow_ipex and (args.use_ipex or shutil.which('sycl-ls') is not None or shutil.which('sycl-ls.exe') is not None or os.environ.get('ONEAPI_ROOT') is not None or os.path.exists('/opt/intel/oneapi') or os.path.exists("C:/Program Files (x86)/Intel/oneAPI") or os.path.exists("C:/oneAPI"))
if is_cuda_available and args.use_cuda: # prioritize cuda
@ -965,8 +948,6 @@ def check_torch():
install(torch_command, 'torch torchvision')
install('onnxruntime-directml', 'onnxruntime-directml', ignore=True)
else:
if args.use_zluda:
log.warning("ZLUDA failed to initialize: no HIP SDK found")
log.warning('Torch: CPU-only version installed')
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
if 'torch' in torch_command and not args.version:

View File

@ -165,7 +165,7 @@ def ipex_init(): # pylint: disable=too-many-statements
pass
# Memory:
if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read():
if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read():
torch.xpu.empty_cache = lambda: None
torch.cuda.empty_cache = torch.xpu.empty_cache

View File

@ -8,8 +8,8 @@ from functools import cache, wraps
# ARC GPUs can't allocate more than 4GB to a single block so we slice the attention layers
dynamic_attention_slice_rate = float(os.environ.get('IPEX_SDPA_SLICE_TRIGGER_RATE', 1))
dynamic_attention_trigger_rate = float(os.environ.get('IPEX_ATTENTION_SLICE_RATE', 0.5))
dynamic_attention_slice_rate = float(os.environ.get("IPEX_SDPA_SLICE_TRIGGER_RATE", "1"))
dynamic_attention_trigger_rate = float(os.environ.get("IPEX_ATTENTION_SLICE_RATE", "0.5"))
# Find something divisible with the input_tokens
@cache

View File

@ -80,8 +80,8 @@ def torch_get_autocast_dtype(device_type=None):
# IPEX 2.5 and above has partial support but doesn't really work most of the time.
original_interpolate = torch.nn.functional.interpolate
@wraps(torch.nn.functional.interpolate)
def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments
if mode in {'bicubic', 'bilinear'}:
def interpolate(tensor, size=None, scale_factor=None, mode="nearest", align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments
if mode in {"bicubic", "bilinear"}:
return_device = tensor.device
return_dtype = tensor.dtype
return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode,
@ -94,8 +94,8 @@ def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corn
# SwinIR BF16:
original_functional_pad = torch.nn.functional.pad
@wraps(torch.nn.functional.pad)
def functional_pad(input, pad, mode='constant', value=None):
if mode == 'reflect' and input.dtype == torch.bfloat16:
def functional_pad(input, pad, mode="constant", value=None):
if mode == "reflect" and input.dtype == torch.bfloat16:
return original_functional_pad(input.to(torch.float32), pad, mode=mode, value=value).to(dtype=torch.bfloat16)
else:
return original_functional_pad(input, pad, mode=mode, value=value)
@ -365,13 +365,13 @@ def ipex_hijacks():
except Exception:
pass
if os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '0':
if os.environ.get("IPEX_FORCE_ATTENTION_SLICE", "0") == "0":
if torch_version[0] > 2 or (torch_version[0] == 2 and torch_version[1] >= 7):
use_dynamic_attention = False # torch 2.7 has flash atten support
else:
use_dynamic_attention = True
else:
use_dynamic_attention = bool(os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '1')
use_dynamic_attention = bool(os.environ.get("IPEX_FORCE_ATTENTION_SLICE", "0") == "1")
if use_dynamic_attention:
from .attention import dynamic_scaled_dot_product_attention

View File

@ -404,7 +404,7 @@ class SDNQQuantizer(DiffusersQuantizer):
def _process_model_after_weight_loading(self, model, **kwargs): # pylint: disable=unused-argument
if shared.opts.diffusers_offload_mode != "none":
model = model.to(devices.cpu)
devices.torch_gc(force=True, reason='sdnq')
devices.torch_gc(force=True, reason="sdnq")
return model
def get_accelerator_warm_up_factor(self):
@ -440,7 +440,7 @@ class SDNQQuantizer(DiffusersQuantizer):
"""
return missing_keys
def update_state_dict_with_metadata(self, state_dict: dict, metadata: dict) -> dict: # pylint: disable=unused-argument
def update_state_dict_with_metadata(self, state_dict: dict, metadata: dict) -> dict:
"""
needed for transformers compatibilty, no-op function
"""

View File

@ -34,7 +34,7 @@ if hasattr(torch, "float8_e5m2fnuz"):
dtype_dict["float8_e5m2fnuz"] = {"min": -57344, "max": 57344, "num_bits": 8, "target_dtype": "fp8", "torch_dtype": torch.float8_e5m2fnuz, "storage_dtype": torch.float8_e5m2fnuz, "is_unsigned": False, "is_integer": False}
use_torch_compile = shared.opts.sdnq_dequantize_compile # this setting requires a full restart of the webui to apply
use_tensorwise_fp8_matmul = os.environ.get('SDNQ_USE_TENSORWISE_FP8_MATMUL', "1").lower() not in {"0", "false", "no"} # row-wise FP8 only exist on H100 hardware, sdnq will use software row-wise with tensorwise hardware with this setting
use_tensorwise_fp8_matmul = os.environ.get("SDNQ_USE_TENSORWISE_FP8_MATMUL", "1").lower() not in {"0", "false", "no"} # row-wise FP8 only exist on H100 hardware, sdnq will use software row-wise with tensorwise hardware with this setting
linear_types = ("Linear",)
conv_types = ("Conv1d", "Conv2d", "Conv3d")