ROCm and ZLUDA: don't fall back to CPU, and clean up strings

pull/4231/head
Disty0 2025-09-27 11:32:46 +03:00
parent 579b1f3175
commit 71fde8a897
6 changed files with 26 additions and 45 deletions

View File

@ -677,18 +677,9 @@ def install_rocm_zluda():
if args.skip_all or args.skip_requirements:
return torch_command
from modules import rocm
if rocm.err is not None:
log.warning(f'ROCm: error checking ROCm toolkit: {rocm.err}')
log.info('Using CPU-only torch')
return os.environ.get('TORCH_COMMAND', 'torch torchvision')
if not rocm.is_installed:
log.warning('ROCm: could not find ROCm toolkit installed')
log.info('Using CPU-only torch')
return os.environ.get('TORCH_COMMAND', 'torch torchvision')
log.info('ROCm: AMD toolkit detected')
# if not is_windows:
# os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow-rocm')
#os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow')
device = None
try:
@ -701,8 +692,6 @@ def install_rocm_zluda():
index = 0
for idx, gpu in enumerate(amd_gpus):
index = idx
# if gpu.name.startswith('gfx11') and os.environ.get('TENSORFLOW_PACKAGE') == 'tensorflow-rocm': # do not use tensorflow-rocm for navi 3x
# os.environ['TENSORFLOW_PACKAGE'] = 'tensorflow==2.13.0'
if not gpu.is_apu:
# although apu was found, there can be a dedicated card. do not break loop.
# if no dedicated card was found, apu will be used.
@ -722,22 +711,23 @@ def install_rocm_zluda():
log.info(msg)
if sys.platform == "win32":
#check_python(supported_minors=[10, 11, 12, 13], reason='ZLUDA backend requires a Python version between 3.10 and 3.13')
if args.use_rocm: # TODO install: switch to pytorch source when it becomes available
if isinstance(rocm.environment, rocm.PythonPackageEnvironment): # TheRock
if device is not None and isinstance(rocm.environment, rocm.PythonPackageEnvironment): # TheRock
check_python(supported_minors=[11, 12, 13], reason='ROCm backend requires a Python version between 3.11 and 3.13')
torch_command = os.environ.get('TORCH_COMMAND', f'torch torchvision --index-url https://rocm.nightlies.amd.com/v2-staging/{rocm.get_distribution(device)}')
else:
check_python(supported_minors=[12], reason='AMD Windows preview requires a Python version 3.12')
check_python(supported_minors=[12], reason='ROCm Windows preview requires Python version 3.12')
torch_command = os.environ.get('TORCH_COMMAND', '--no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-6.4.4/torch-2.8.0a0%2Bgitfc14c65-cp312-cp312-win_amd64.whl https://repo.radeon.com/rocm/windows/rocm-rel-6.4.4/torchvision-0.24.0a0%2Bc85f008-cp312-cp312-win_amd64.whl')
else:
#check_python(supported_minors=[10, 11, 12, 13], reason='ZLUDA backend requires a Python version between 3.10 and 3.13')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.7.1+cu118 torchvision==0.22.1+cu118 --index-url https://download.pytorch.org/whl/cu118')
if args.device_id is not None:
if os.environ.get('HIP_VISIBLE_DEVICES', None) is not None:
log.warning('Setting HIP_VISIBLE_DEVICES and --device-id at the same time may be mistake.')
os.environ['HIP_VISIBLE_DEVICES'] = args.device_id
del args.device_id
error = None
from modules import zluda_installer
try:
if args.reinstall or zluda_installer.is_reinstall_needed():
@ -745,19 +735,12 @@ def install_rocm_zluda():
zluda_installer.install()
zluda_installer.set_default_agent(device)
except Exception as e:
error = e
log.warning(f'Failed to install ZLUDA: {e}')
if error is None:
try:
zluda_installer.load()
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.7.1+cu118 torchvision==0.22.1+cu118 --index-url https://download.pytorch.org/whl/cu118')
except Exception as e:
error = e
log.warning(f'Failed to load ZLUDA: {e}')
if error is not None:
log.info('Using CPU-only torch')
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
try:
zluda_installer.load()
except Exception as e:
log.warning(f'Failed to load ZLUDA: {e}')
else:
#check_python(supported_minors=[10, 11, 12, 13], reason='ROCm backend requires a Python version between 3.10 and 3.13')
@ -793,7 +776,7 @@ def install_rocm_zluda():
log.info(f'ROCm: HSA_OVERRIDE_GFX_VERSION auto config skipped: device={device.name if device is not None else None} version={os.environ.get("HSA_OVERRIDE_GFX_VERSION", None)}')
else:
gfx_ver = device.get_gfx_version()
if gfx_ver is not None:
if gfx_ver is not None and device.name.removeprefix("gfx") != gfx_ver.replace(".", ""):
os.environ.setdefault('HSA_OVERRIDE_GFX_VERSION', gfx_ver)
log.info(f'ROCm: HSA_OVERRIDE_GFX_VERSION config overridden: device={device.name} version={os.environ.get("HSA_OVERRIDE_GFX_VERSION", None)}')
@ -936,8 +919,8 @@ def check_torch():
if torch_command != '':
pass
else:
is_cuda_available = allow_cuda and (shutil.which('nvidia-smi') is not None or args.use_xformers or os.path.exists(os.path.join(os.environ.get('SystemRoot') or r'C:\Windows', 'System32', 'nvidia-smi.exe')))
is_rocm_available = allow_rocm and rocm.is_installed
is_cuda_available = allow_cuda and (args.use_cuda or shutil.which('nvidia-smi') is not None or args.use_xformers or os.path.exists(os.path.join(os.environ.get('SystemRoot') or r'C:\Windows', 'System32', 'nvidia-smi.exe')))
is_rocm_available = allow_rocm and (args.use_rocm or args.use_zluda or rocm.is_installed)
is_ipex_available = allow_ipex and (args.use_ipex or shutil.which('sycl-ls') is not None or shutil.which('sycl-ls.exe') is not None or os.environ.get('ONEAPI_ROOT') is not None or os.path.exists('/opt/intel/oneapi') or os.path.exists("C:/Program Files (x86)/Intel/oneAPI") or os.path.exists("C:/oneAPI"))
if is_cuda_available and args.use_cuda: # prioritize cuda
@ -965,8 +948,6 @@ def check_torch():
install(torch_command, 'torch torchvision')
install('onnxruntime-directml', 'onnxruntime-directml', ignore=True)
else:
if args.use_zluda:
log.warning("ZLUDA failed to initialize: no HIP SDK found")
log.warning('Torch: CPU-only version installed')
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
if 'torch' in torch_command and not args.version:

View File

@ -165,7 +165,7 @@ def ipex_init(): # pylint: disable=too-many-statements
pass
# Memory:
if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read():
if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read():
torch.xpu.empty_cache = lambda: None
torch.cuda.empty_cache = torch.xpu.empty_cache

View File

@ -8,8 +8,8 @@ from functools import cache, wraps
# ARC GPUs can't allocate more than 4GB to a single block so we slice the attention layers
dynamic_attention_slice_rate = float(os.environ.get('IPEX_SDPA_SLICE_TRIGGER_RATE', 1))
dynamic_attention_trigger_rate = float(os.environ.get('IPEX_ATTENTION_SLICE_RATE', 0.5))
dynamic_attention_slice_rate = float(os.environ.get("IPEX_SDPA_SLICE_TRIGGER_RATE", "1"))
dynamic_attention_trigger_rate = float(os.environ.get("IPEX_ATTENTION_SLICE_RATE", "0.5"))
# Find something divisible with the input_tokens
@cache

View File

@ -80,8 +80,8 @@ def torch_get_autocast_dtype(device_type=None):
# IPEX 2.5 and above has partial support but doesn't really work most of the time.
original_interpolate = torch.nn.functional.interpolate
@wraps(torch.nn.functional.interpolate)
def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments
if mode in {'bicubic', 'bilinear'}:
def interpolate(tensor, size=None, scale_factor=None, mode="nearest", align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments
if mode in {"bicubic", "bilinear"}:
return_device = tensor.device
return_dtype = tensor.dtype
return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode,
@ -94,8 +94,8 @@ def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corn
# SwinIR BF16:
original_functional_pad = torch.nn.functional.pad
@wraps(torch.nn.functional.pad)
def functional_pad(input, pad, mode='constant', value=None):
if mode == 'reflect' and input.dtype == torch.bfloat16:
def functional_pad(input, pad, mode="constant", value=None):
if mode == "reflect" and input.dtype == torch.bfloat16:
return original_functional_pad(input.to(torch.float32), pad, mode=mode, value=value).to(dtype=torch.bfloat16)
else:
return original_functional_pad(input, pad, mode=mode, value=value)
@ -365,13 +365,13 @@ def ipex_hijacks():
except Exception:
pass
if os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '0':
if os.environ.get("IPEX_FORCE_ATTENTION_SLICE", "0") == "0":
if torch_version[0] > 2 or (torch_version[0] == 2 and torch_version[1] >= 7):
use_dynamic_attention = False # torch 2.7 has flash atten support
else:
use_dynamic_attention = True
else:
use_dynamic_attention = bool(os.environ.get('IPEX_FORCE_ATTENTION_SLICE', '0') == '1')
use_dynamic_attention = bool(os.environ.get("IPEX_FORCE_ATTENTION_SLICE", "0") == "1")
if use_dynamic_attention:
from .attention import dynamic_scaled_dot_product_attention

View File

@ -404,7 +404,7 @@ class SDNQQuantizer(DiffusersQuantizer):
def _process_model_after_weight_loading(self, model, **kwargs): # pylint: disable=unused-argument
if shared.opts.diffusers_offload_mode != "none":
model = model.to(devices.cpu)
devices.torch_gc(force=True, reason='sdnq')
devices.torch_gc(force=True, reason="sdnq")
return model
def get_accelerator_warm_up_factor(self):
@ -440,7 +440,7 @@ class SDNQQuantizer(DiffusersQuantizer):
"""
return missing_keys
def update_state_dict_with_metadata(self, state_dict: dict, metadata: dict) -> dict: # pylint: disable=unused-argument
def update_state_dict_with_metadata(self, state_dict: dict, metadata: dict) -> dict:
"""
needed for transformers compatibilty, no-op function
"""

View File

@ -34,7 +34,7 @@ if hasattr(torch, "float8_e5m2fnuz"):
dtype_dict["float8_e5m2fnuz"] = {"min": -57344, "max": 57344, "num_bits": 8, "target_dtype": "fp8", "torch_dtype": torch.float8_e5m2fnuz, "storage_dtype": torch.float8_e5m2fnuz, "is_unsigned": False, "is_integer": False}
use_torch_compile = shared.opts.sdnq_dequantize_compile # this setting requires a full restart of the webui to apply
use_tensorwise_fp8_matmul = os.environ.get('SDNQ_USE_TENSORWISE_FP8_MATMUL', "1").lower() not in {"0", "false", "no"} # row-wise FP8 only exist on H100 hardware, sdnq will use software row-wise with tensorwise hardware with this setting
use_tensorwise_fp8_matmul = os.environ.get("SDNQ_USE_TENSORWISE_FP8_MATMUL", "1").lower() not in {"0", "false", "no"} # row-wise FP8 only exist on H100 hardware, sdnq will use software row-wise with tensorwise hardware with this setting
linear_types = ("Linear",)
conv_types = ("Conv1d", "Conv2d", "Conv3d")