diff --git a/installer.py b/installer.py index 1bbb03719..22172970b 100644 --- a/installer.py +++ b/installer.py @@ -677,18 +677,9 @@ def install_rocm_zluda(): if args.skip_all or args.skip_requirements: return torch_command from modules import rocm - if rocm.err is not None: - log.warning(f'ROCm: error checking ROCm toolkit: {rocm.err}') - log.info('Using CPU-only torch') - return os.environ.get('TORCH_COMMAND', 'torch torchvision') - if not rocm.is_installed: - log.warning('ROCm: could not find ROCm toolkit installed') - log.info('Using CPU-only torch') - return os.environ.get('TORCH_COMMAND', 'torch torchvision') log.info('ROCm: AMD toolkit detected') - # if not is_windows: - # os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow-rocm') + #os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow') device = None try: @@ -701,8 +692,6 @@ def install_rocm_zluda(): index = 0 for idx, gpu in enumerate(amd_gpus): index = idx - # if gpu.name.startswith('gfx11') and os.environ.get('TENSORFLOW_PACKAGE') == 'tensorflow-rocm': # do not use tensorflow-rocm for navi 3x - # os.environ['TENSORFLOW_PACKAGE'] = 'tensorflow==2.13.0' if not gpu.is_apu: # although apu was found, there can be a dedicated card. do not break loop. # if no dedicated card was found, apu will be used. @@ -722,22 +711,23 @@ def install_rocm_zluda(): log.info(msg) if sys.platform == "win32": - #check_python(supported_minors=[10, 11, 12, 13], reason='ZLUDA backend requires a Python version between 3.10 and 3.13') - if args.use_rocm: # TODO install: switch to pytorch source when it becomes available - if isinstance(rocm.environment, rocm.PythonPackageEnvironment): # TheRock + if device is not None and isinstance(rocm.environment, rocm.PythonPackageEnvironment): # TheRock + check_python(supported_minors=[11, 12, 13], reason='ROCm backend requires a Python version between 3.11 and 3.13') torch_command = os.environ.get('TORCH_COMMAND', f'torch torchvision --index-url https://rocm.nightlies.amd.com/v2-staging/{rocm.get_distribution(device)}') else: - check_python(supported_minors=[12], reason='AMD Windows preview requires a Python version 3.12') + check_python(supported_minors=[12], reason='ROCm Windows preview requires Python version 3.12') torch_command = os.environ.get('TORCH_COMMAND', '--no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-6.4.4/torch-2.8.0a0%2Bgitfc14c65-cp312-cp312-win_amd64.whl https://repo.radeon.com/rocm/windows/rocm-rel-6.4.4/torchvision-0.24.0a0%2Bc85f008-cp312-cp312-win_amd64.whl') else: + #check_python(supported_minors=[10, 11, 12, 13], reason='ZLUDA backend requires a Python version between 3.10 and 3.13') + torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.7.1+cu118 torchvision==0.22.1+cu118 --index-url https://download.pytorch.org/whl/cu118') + if args.device_id is not None: if os.environ.get('HIP_VISIBLE_DEVICES', None) is not None: log.warning('Setting HIP_VISIBLE_DEVICES and --device-id at the same time may be mistake.') os.environ['HIP_VISIBLE_DEVICES'] = args.device_id del args.device_id - error = None from modules import zluda_installer try: if args.reinstall or zluda_installer.is_reinstall_needed(): @@ -745,19 +735,12 @@ def install_rocm_zluda(): zluda_installer.install() zluda_installer.set_default_agent(device) except Exception as e: - error = e log.warning(f'Failed to install ZLUDA: {e}') - if error is None: - try: - zluda_installer.load() - torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.7.1+cu118 torchvision==0.22.1+cu118 --index-url https://download.pytorch.org/whl/cu118') - except Exception as e: - error = e - log.warning(f'Failed to load ZLUDA: {e}') - if error is not None: - log.info('Using CPU-only torch') - torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision') + try: + zluda_installer.load() + except Exception as e: + log.warning(f'Failed to load ZLUDA: {e}') else: #check_python(supported_minors=[10, 11, 12, 13], reason='ROCm backend requires a Python version between 3.10 and 3.13') @@ -793,7 +776,7 @@ def install_rocm_zluda(): log.info(f'ROCm: HSA_OVERRIDE_GFX_VERSION auto config skipped: device={device.name if device is not None else None} version={os.environ.get("HSA_OVERRIDE_GFX_VERSION", None)}') else: gfx_ver = device.get_gfx_version() - if gfx_ver is not None: + if gfx_ver is not None and device.name.removeprefix("gfx") != gfx_ver.replace(".", ""): os.environ.setdefault('HSA_OVERRIDE_GFX_VERSION', gfx_ver) log.info(f'ROCm: HSA_OVERRIDE_GFX_VERSION config overridden: device={device.name} version={os.environ.get("HSA_OVERRIDE_GFX_VERSION", None)}') @@ -936,8 +919,8 @@ def check_torch(): if torch_command != '': pass else: - is_cuda_available = allow_cuda and (shutil.which('nvidia-smi') is not None or args.use_xformers or os.path.exists(os.path.join(os.environ.get('SystemRoot') or r'C:\Windows', 'System32', 'nvidia-smi.exe'))) - is_rocm_available = allow_rocm and rocm.is_installed + is_cuda_available = allow_cuda and (args.use_cuda or shutil.which('nvidia-smi') is not None or args.use_xformers or os.path.exists(os.path.join(os.environ.get('SystemRoot') or r'C:\Windows', 'System32', 'nvidia-smi.exe'))) + is_rocm_available = allow_rocm and (args.use_rocm or args.use_zluda or rocm.is_installed) is_ipex_available = allow_ipex and (args.use_ipex or shutil.which('sycl-ls') is not None or shutil.which('sycl-ls.exe') is not None or os.environ.get('ONEAPI_ROOT') is not None or os.path.exists('/opt/intel/oneapi') or os.path.exists("C:/Program Files (x86)/Intel/oneAPI") or os.path.exists("C:/oneAPI")) if is_cuda_available and args.use_cuda: # prioritize cuda @@ -965,8 +948,6 @@ def check_torch(): install(torch_command, 'torch torchvision') install('onnxruntime-directml', 'onnxruntime-directml', ignore=True) else: - if args.use_zluda: - log.warning("ZLUDA failed to initialize: no HIP SDK found") log.warning('Torch: CPU-only version installed') torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision') if 'torch' in torch_command and not args.version: diff --git a/modules/intel/ipex/__init__.py b/modules/intel/ipex/__init__.py index 93f94d280..3d49beca7 100644 --- a/modules/intel/ipex/__init__.py +++ b/modules/intel/ipex/__init__.py @@ -165,7 +165,7 @@ def ipex_init(): # pylint: disable=too-many-statements pass # Memory: - if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): + if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read(): torch.xpu.empty_cache = lambda: None torch.cuda.empty_cache = torch.xpu.empty_cache diff --git a/modules/intel/ipex/attention.py b/modules/intel/ipex/attention.py index c3e8ef8ed..aacccca75 100644 --- a/modules/intel/ipex/attention.py +++ b/modules/intel/ipex/attention.py @@ -8,8 +8,8 @@ from functools import cache, wraps # ARC GPUs can't allocate more than 4GB to a single block so we slice the attention layers -dynamic_attention_slice_rate = float(os.environ.get('IPEX_SDPA_SLICE_TRIGGER_RATE', 1)) -dynamic_attention_trigger_rate = float(os.environ.get('IPEX_ATTENTION_SLICE_RATE', 0.5)) +dynamic_attention_slice_rate = float(os.environ.get("IPEX_SDPA_SLICE_TRIGGER_RATE", "1")) +dynamic_attention_trigger_rate = float(os.environ.get("IPEX_ATTENTION_SLICE_RATE", "0.5")) # Find something divisible with the input_tokens @cache diff --git a/modules/intel/ipex/hijacks.py b/modules/intel/ipex/hijacks.py index c9be1f789..8e8961476 100644 --- a/modules/intel/ipex/hijacks.py +++ b/modules/intel/ipex/hijacks.py @@ -80,8 +80,8 @@ def torch_get_autocast_dtype(device_type=None): # IPEX 2.5 and above has partial support but doesn't really work most of the time. original_interpolate = torch.nn.functional.interpolate @wraps(torch.nn.functional.interpolate) -def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments - if mode in {'bicubic', 'bilinear'}: +def interpolate(tensor, size=None, scale_factor=None, mode="nearest", align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments + if mode in {"bicubic", "bilinear"}: return_device = tensor.device return_dtype = tensor.dtype return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode, @@ -94,8 +94,8 @@ def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corn # SwinIR BF16: original_functional_pad = torch.nn.functional.pad @wraps(torch.nn.functional.pad) -def functional_pad(input, pad, mode='constant', value=None): - if mode == 'reflect' and input.dtype == torch.bfloat16: +def functional_pad(input, pad, mode="constant", value=None): + if mode == "reflect" and input.dtype == torch.bfloat16: return original_functional_pad(input.to(torch.float32), pad, mode=mode, value=value).to(dtype=torch.bfloat16) else: return original_functional_pad(input, pad, mode=mode, value=value) @@ -365,13 +365,13 @@ def ipex_hijacks(): except Exception: pass - if os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '0': + if os.environ.get("IPEX_FORCE_ATTENTION_SLICE", "0") == "0": if torch_version[0] > 2 or (torch_version[0] == 2 and torch_version[1] >= 7): use_dynamic_attention = False # torch 2.7 has flash atten support else: use_dynamic_attention = True else: - use_dynamic_attention = bool(os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '1') + use_dynamic_attention = bool(os.environ.get("IPEX_FORCE_ATTENTION_SLICE", "0") == "1") if use_dynamic_attention: from .attention import dynamic_scaled_dot_product_attention diff --git a/modules/sdnq/__init__.py b/modules/sdnq/__init__.py index 6ba49cbb1..6b8e7c409 100644 --- a/modules/sdnq/__init__.py +++ b/modules/sdnq/__init__.py @@ -404,7 +404,7 @@ class SDNQQuantizer(DiffusersQuantizer): def _process_model_after_weight_loading(self, model, **kwargs): # pylint: disable=unused-argument if shared.opts.diffusers_offload_mode != "none": model = model.to(devices.cpu) - devices.torch_gc(force=True, reason='sdnq') + devices.torch_gc(force=True, reason="sdnq") return model def get_accelerator_warm_up_factor(self): @@ -440,7 +440,7 @@ class SDNQQuantizer(DiffusersQuantizer): """ return missing_keys - def update_state_dict_with_metadata(self, state_dict: dict, metadata: dict) -> dict: # pylint: disable=unused-argument + def update_state_dict_with_metadata(self, state_dict: dict, metadata: dict) -> dict: """ needed for transformers compatibilty, no-op function """ diff --git a/modules/sdnq/common.py b/modules/sdnq/common.py index e6ed21d4d..ecd06a653 100644 --- a/modules/sdnq/common.py +++ b/modules/sdnq/common.py @@ -34,7 +34,7 @@ if hasattr(torch, "float8_e5m2fnuz"): dtype_dict["float8_e5m2fnuz"] = {"min": -57344, "max": 57344, "num_bits": 8, "target_dtype": "fp8", "torch_dtype": torch.float8_e5m2fnuz, "storage_dtype": torch.float8_e5m2fnuz, "is_unsigned": False, "is_integer": False} use_torch_compile = shared.opts.sdnq_dequantize_compile # this setting requires a full restart of the webui to apply -use_tensorwise_fp8_matmul = os.environ.get('SDNQ_USE_TENSORWISE_FP8_MATMUL', "1").lower() not in {"0", "false", "no"} # row-wise FP8 only exist on H100 hardware, sdnq will use software row-wise with tensorwise hardware with this setting +use_tensorwise_fp8_matmul = os.environ.get("SDNQ_USE_TENSORWISE_FP8_MATMUL", "1").lower() not in {"0", "false", "no"} # row-wise FP8 only exist on H100 hardware, sdnq will use software row-wise with tensorwise hardware with this setting linear_types = ("Linear",) conv_types = ("Conv1d", "Conv2d", "Conv3d")