code quality and layout fixes

pull/4726/head
resonantsky 2026-04-03 10:12:05 +02:00
parent 4cafae9350
commit 01d53edb25
3 changed files with 114 additions and 115 deletions

View File

@ -29,17 +29,17 @@ CONFIG = Path(os.path.abspath(os.path.join('data', 'rocm.json')))
_cache: Optional[Dict[str, str]] = None # loaded once, invalidated on save _cache: Optional[Dict[str, str]] = None # loaded once, invalidated on save
# Metadata key written into rocm.json to record which architecture profile is active. # Metadata key written into rocm.json to record which architecture profile is active.
# Not an environment variable always skipped during env application but preserved in the # Not an environment variable - always skipped during env application but preserved in the
# saved config so that arch-safety enforcement is consistent across restarts. # saved config so that arch-safety enforcement is consistent across restarts.
_ARCH_KEY = "_rocm_arch" _ARCH_KEY = "_rocm_arch"
# Vars that must never appear in the process environment. # Vars that must never appear in the process environment.
# #
# _DTYPE_UNSAFE: alter FP16 inference dtype must be cleared regardless of config # _DTYPE_UNSAFE: alter FP16 inference dtype - must be cleared regardless of config
# MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL DEBUG alias: routes all FP16 convs through BF16 exponent math # MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL - DEBUG alias: routes all FP16 convs through BF16 exponent math
# MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL API-level alias: same BF16-exponent effect # MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL - API-level alias: same BF16-exponent effect
# MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM unstable experimental FP16 path # MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM - unstable experimental FP16 path
# MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16 changes FP16 WrW atomic accumulation # MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16 - changes FP16 WrW atomic accumulation
# #
# SOLVER_DISABLED_BY_DEFAULT: every solver known to be incompatible with this runtime # SOLVER_DISABLED_BY_DEFAULT: every solver known to be incompatible with this runtime
# (FP32-only, training-only WrW/BWD, fixed-geometry mismatches, XDLOPS/CDNA-only, arch-specific). # (FP32-only, training-only WrW/BWD, fixed-geometry mismatches, XDLOPS/CDNA-only, arch-specific).
@ -54,18 +54,18 @@ _DTYPE_UNSAFE = {
# regardless of saved config. Limited to dtype-corrupting vars only. # regardless of saved config. Limited to dtype-corrupting vars only.
# IMPORTANT: SOLVER_DISABLED_BY_DEFAULT is intentionally NOT included here. # IMPORTANT: SOLVER_DISABLED_BY_DEFAULT is intentionally NOT included here.
# When a solver var is absent (unset) MIOpen still calls IsApplicable() on every # When a solver var is absent (unset) MIOpen still calls IsApplicable() on every
# conv-find wasted probing overhead. When a var is explicitly "0" MIOpen skips # conv-find - wasted probing overhead. When a var is explicitly "0" MIOpen skips
# IsApplicable() immediately. Solver defaults flow through the config loop as "0" # IsApplicable() immediately. Solver defaults flow through the config loop as "0"
# (their ROCM_ENV_VARS default is "0") so they are explicitly set to "0" in the env. # (their ROCM_ENV_VARS default is "0") so they are explicitly set to "0" in the env.
_UNSET_VARS = _DTYPE_UNSAFE _UNSET_VARS = _DTYPE_UNSAFE
# Additional environment vars that must be removed from the process before MIOpen loads. # Additional environment vars that must be removed from the process before MIOpen loads.
# These are not MIOpen solver toggles but can corrupt MIOpen's runtime behaviour: # These are not MIOpen solver toggles but can corrupt MIOpen's runtime behaviour:
# HIP_PATH / HIP_PATH_71 point to the system AMD ROCm install; override the venv-bundled # HIP_PATH / HIP_PATH_71 - point to the system AMD ROCm install; override the venv-bundled
# _rocm_sdk_devel DLLs with a potentially mismatched system version # _rocm_sdk_devel DLLs with a potentially mismatched system version
# QML_*/QT_* QtQuick shader/disk-cache flags leaked from Qt tools; harmless for # QML_*/QT_* - QtQuick shader/disk-cache flags leaked from Qt tools; harmless for
# PyTorch but can conflict with Gradio's embedded Qt helpers # PyTorch but can conflict with Gradio's embedded Qt helpers
# PYENV_VIRTUALENV_DISABLE_PROMPT pyenv noise that confuses venv detection # PYENV_VIRTUALENV_DISABLE_PROMPT - pyenv noise that confuses venv detection
_EXTRA_CLEAR_VARS = { _EXTRA_CLEAR_VARS = {
"HIP_PATH", "HIP_PATH",
"HIP_PATH_71", "HIP_PATH_71",
@ -73,7 +73,7 @@ _EXTRA_CLEAR_VARS = {
"QML_DISABLE_DISK_CACHE", "QML_DISABLE_DISK_CACHE",
"QML_FORCE_DISK_CACHE", "QML_FORCE_DISK_CACHE",
"QT_DISABLE_SHADER_DISK_CACHE", "QT_DISABLE_SHADER_DISK_CACHE",
# PERF_VALS vars are NOT boolean toggles MIOpen reads them as perf-config strings. # PERF_VALS vars are NOT boolean toggles - MIOpen reads them as perf-config strings.
# If inherited from a parent shell with value "1", MIOpen's GetPerfConfFromEnv parses # If inherited from a parent shell with value "1", MIOpen's GetPerfConfFromEnv parses
# "1" as a degenerate config and can return dtype=float32 output from FP16 tensors. # "1" as a degenerate config and can return dtype=float32 output from FP16 tensors.
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS",
@ -82,12 +82,12 @@ _EXTRA_CLEAR_VARS = {
# Solvers whose MIOpen IsApplicable() explicitly rejects non-FP32 tensors. # Solvers whose MIOpen IsApplicable() explicitly rejects non-FP32 tensors.
# They are safe to leave enabled in FP32 mode. When the active dtype is FP16 or BF16 # They are safe to leave enabled in FP32 mode. When the active dtype is FP16 or BF16
# we force them OFF so MIOpen skips the IsApplicable probe entirely avoids overhead on # we force them OFF so MIOpen skips the IsApplicable probe entirely - avoids overhead on
# every conv shape find. These are NOT in _UNSET_VARS because they are valid in FP32. # every conv shape find. These are NOT in _UNSET_VARS because they are valid in FP32.
_FP32_ONLY_SOLVERS = { _FP32_ONLY_SOLVERS = {
"MIOPEN_DEBUG_CONV_FFT", # FFT convolution FP32 only (MIOpen source: IsFp32 check) "MIOPEN_DEBUG_CONV_FFT", # FFT convolution - FP32 only (MIOpen source: IsFp32 check)
"MIOPEN_DEBUG_AMD_WINOGRAD_3X3", # Winograd 3x3 FP32 only "MIOPEN_DEBUG_AMD_WINOGRAD_3X3", # Winograd 3x3 - FP32 only
"MIOPEN_DEBUG_AMD_FUSED_WINOGRAD", # Fused Winograd FP32 only "MIOPEN_DEBUG_AMD_FUSED_WINOGRAD", # Fused Winograd - FP32 only
} }
@ -172,7 +172,7 @@ def load_config() -> Dict[str, str]:
_cache = data if data else {k: v["default"] for k, v in ROCM_ENV_VARS.items()} _cache = data if data else {k: v["default"] for k, v in ROCM_ENV_VARS.items()}
# Purge unsafe vars from a stale saved config and re-persist only if the file existed. # Purge unsafe vars from a stale saved config and re-persist only if the file existed.
# When running without a saved config (first run / after Delete), load_config() must # When running without a saved config (first run / after Delete), load_config() must
# never create the file that only happens via save_config() on Apply or Apply Profile. # never create the file - that only happens via save_config() on Apply or Apply Profile.
dirty = {k for k in _cache if k in _UNSET_VARS or (k != _ARCH_KEY and k not in ROCM_ENV_VARS)} dirty = {k for k in _cache if k in _UNSET_VARS or (k != _ARCH_KEY and k not in ROCM_ENV_VARS)}
if dirty: if dirty:
_cache = {k: v for k, v in _cache.items() if k not in dirty} _cache = {k: v for k, v in _cache.items() if k not in dirty}
@ -221,7 +221,7 @@ def apply_env(config: Optional[Dict[str, str]] = None) -> None:
os.environ[var] = expanded os.environ[var] = expanded
# Arch safety net: hard-force all hardware-incompatible vars to "0" in the env. # Arch safety net: hard-force all hardware-incompatible vars to "0" in the env.
# This runs *after* the config loop so it overrides any stale "1" that survived in the JSON. # This runs *after* the config loop so it overrides any stale "1" that survived in the JSON.
# Source of truth: rocm_profiles.UNAVAILABLE[arch] vars with no supporting hardware. # Source of truth: rocm_profiles.UNAVAILABLE[arch] - vars with no supporting hardware.
arch = config.get(_ARCH_KEY, "") arch = config.get(_ARCH_KEY, "")
unavailable = rocm_profiles.UNAVAILABLE.get(arch, set()) unavailable = rocm_profiles.UNAVAILABLE.get(arch, set())
if unavailable: if unavailable:
@ -249,7 +249,7 @@ def apply_all(names: list, values: list) -> None:
meta = ROCM_ENV_VARS[name] meta = ROCM_ENV_VARS[name]
if meta["widget"] == "checkbox": if meta["widget"] == "checkbox":
if value is None: if value is None:
pass # Gradio passed None (component not interacted with) leave config unchanged pass # Gradio passed None (component not interacted with) - leave config unchanged
else: else:
config[name] = "1" if value else "0" config[name] = "1" if value else "0"
elif meta["widget"] == "radio": elif meta["widget"] == "radio":
@ -257,7 +257,7 @@ def apply_all(names: list, values: list) -> None:
valid = {v for _, v in meta["options"]} if meta["options"] and isinstance(meta["options"][0], tuple) else set(meta["options"] or []) valid = {v for _, v in meta["options"]} if meta["options"] and isinstance(meta["options"][0], tuple) else set(meta["options"] or [])
if stored in valid: if stored in valid:
config[name] = stored config[name] = stored
# else: value was None/invalid leave the existing saved value untouched # else: value was None/invalid - leave the existing saved value untouched
else: else:
if meta.get("options"): if meta.get("options"):
value = _dropdown_stored(str(value), meta["options"]) value = _dropdown_stored(str(value), meta["options"])
@ -300,7 +300,7 @@ def delete_config() -> None:
CONFIG.unlink() CONFIG.unlink()
log.info(f'ROCm delete_config: deleted {CONFIG}') log.info(f'ROCm delete_config: deleted {CONFIG}')
_cache = None _cache = None
# Delete the MIOpen user DB (~/.miopen/db) stale entries can cause solver mismatches # Delete the MIOpen user DB (~/.miopen/db) - stale entries can cause solver mismatches
miopen_db = Path(os.path.expanduser('~')) / '.miopen' / 'db' miopen_db = Path(os.path.expanduser('~')) / '.miopen' / 'db'
if miopen_db.exists(): if miopen_db.exists():
shutil.rmtree(miopen_db, ignore_errors=True) shutil.rmtree(miopen_db, ignore_errors=True)
@ -458,7 +458,7 @@ def info() -> dict:
if ufiles: if ufiles:
udb["files"] = ufiles udb["files"] = ufiles
# --- User cache (~/.miopen/cache/<version-hash>) --- # User cache (~/.miopen/cache/<version-hash>)
cache_base = Path.home() / ".miopen" / "cache" cache_base = Path.home() / ".miopen" / "cache"
db_hash = _extract_db_hash(user_db_path) if user_db_path.exists() else "" db_hash = _extract_db_hash(user_db_path) if user_db_path.exists() else ""
cache_path = cache_base / db_hash if db_hash else cache_base cache_path = cache_base / db_hash if db_hash else cache_base

View File

@ -1,4 +1,4 @@
""" """
Architecture-specific MIOpen solver profiles for AMD GCN/RDNA GPUs. Architecture-specific MIOpen solver profiles for AMD GCN/RDNA GPUs.
Sources: Sources:
@ -6,8 +6,8 @@ Sources:
Key axis: consumer RDNA GPUs have NO XDLOPS hardware (that's CDNA/Instinct only). Key axis: consumer RDNA GPUs have NO XDLOPS hardware (that's CDNA/Instinct only).
RDNA2 (gfx1030): RX 6000 series RDNA2 (gfx1030): RX 6000 series
RDNA3 (gfx1100): RX 7000 series adds Fury Winograd, wider MPASS RDNA3 (gfx1100): RX 7000 series - adds Fury Winograd, wider MPASS
RDNA4 (gfx1200): RX 9000 series adds Rage Winograd, wider MPASS RDNA4 (gfx1200): RX 9000 series - adds Rage Winograd, wider MPASS
Each profile is a dict of {var: value} that will be MERGED on top of the Each profile is a dict of {var: value} that will be MERGED on top of the
current config (general vars like DB path / log level are preserved). current config (general vars like DB path / log level are preserved).
@ -15,9 +15,9 @@ current config (general vars like DB path / log level are preserved).
from typing import Dict from typing import Dict
# ---------------------------------------------------------------------------
# Shared: everything that must be OFF on ALL consumer RDNA (no XDLOPS hw) # Shared: everything that must be OFF on ALL consumer RDNA (no XDLOPS hw)
# ---------------------------------------------------------------------------
_XDLOPS_OFF: Dict[str, str] = { _XDLOPS_OFF: Dict[str, str] = {
# GTC XDLOPS (CDNA-only) # GTC XDLOPS (CDNA-only)
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS": "0",
@ -55,7 +55,7 @@ _XDLOPS_OFF: Dict[str, str] = {
# MLIR (CDNA-only in practice) # MLIR (CDNA-only in practice)
"MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW_XDLOPS": "0", "MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW_XDLOPS": "0",
"MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS": "0", "MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS": "0",
# MP BD Winograd (Multi-pass Block-Decomposed CDNA / high-end only) # MP BD Winograd (Multi-pass Block-Decomposed - CDNA / high-end only)
"MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F2X3": "0", "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F2X3": "0",
"MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F3X3": "0", "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F3X3": "0",
"MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F4X3": "0", "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F4X3": "0",
@ -68,17 +68,17 @@ _XDLOPS_OFF: Dict[str, str] = {
"MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F6X3": "0", "MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F6X3": "0",
} }
# ---------------------------------------------------------------------------
# RDNA2 gfx1030 (RX 6000 series) # RDNA2 - gfx1030 (RX 6000 series)
# No XDLOPS, no Fury/Rage Winograd, MPASS limited to F3x2/F3x3 # No XDLOPS, no Fury/Rage Winograd, MPASS limited to F3x2/F3x3
# ASM IGEMM: V4R1 variants only; HIP IGEMM: non-XDLOPS V4R1/R4 only # ASM IGEMM: V4R1 variants only; HIP IGEMM: non-XDLOPS V4R1/R4 only
# ---------------------------------------------------------------------------
RDNA2: Dict[str, str] = { RDNA2: Dict[str, str] = {
**_XDLOPS_OFF, **_XDLOPS_OFF,
# General settings (architecture-independent; set here so all profiles cover them) # General settings (architecture-independent; set here so all profiles cover them)
"MIOPEN_SEARCH_CUTOFF": "0", "MIOPEN_SEARCH_CUTOFF": "0",
"MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC": "0", "MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC": "0",
# Core algo enables FFT is FP32-only but harmless (IsApplicable rejects it for fp16 tensors) # Core algo enables - FFT is FP32-only but harmless (IsApplicable rejects it for fp16 tensors)
"MIOPEN_DEBUG_CONV_FFT": "1", "MIOPEN_DEBUG_CONV_FFT": "1",
"MIOPEN_DEBUG_CONV_DIRECT": "1", "MIOPEN_DEBUG_CONV_DIRECT": "1",
"MIOPEN_DEBUG_CONV_GEMM": "1", "MIOPEN_DEBUG_CONV_GEMM": "1",
@ -93,16 +93,16 @@ RDNA2: Dict[str, str] = {
"MIOPEN_DEBUG_OPENCL_CONVOLUTIONS": "1", "MIOPEN_DEBUG_OPENCL_CONVOLUTIONS": "1",
"MIOPEN_DEBUG_OPENCL_WAVE64_NOWGP": "1", "MIOPEN_DEBUG_OPENCL_WAVE64_NOWGP": "1",
"MIOPEN_DEBUG_ATTN_SOFTMAX": "1", "MIOPEN_DEBUG_ATTN_SOFTMAX": "1",
# Direct ASM dtype notes # Direct ASM - dtype notes
# 3X3U / 1X1U / 1X1UV2: FP32/FP16 forward enabled # 3X3U / 1X1U / 1X1UV2: FP32/FP16 forward - enabled
"MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U": "1",
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U": "1",
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2": "1",
# 5X10U2V2: fixed geometry (5*10 stride-2), no SD conv matches disabled # 5X10U2V2: fixed geometry (5*10 stride-2), no SD conv matches - disabled
"MIOPEN_DEBUG_CONV_DIRECT_ASM_5X10U2V2": "0", "MIOPEN_DEBUG_CONV_DIRECT_ASM_5X10U2V2": "0",
# 7X7C3H224W224: hard-coded ImageNet stem (C=3, H=W=224, K=64) — never matches SD — disabled # 7X7C3H224W224: hard-coded ImageNet stem (C=3, H=W=224, K=64) - never matches SD - disabled
"MIOPEN_DEBUG_CONV_DIRECT_ASM_7X7C3H224W224": "0", "MIOPEN_DEBUG_CONV_DIRECT_ASM_7X7C3H224W224": "0",
# WRW3X3 / WRW1X1: FP32-only weight-gradient (training only) disabled for inference # WRW3X3 / WRW1X1: FP32-only weight-gradient (training only) - disabled for inference
"MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3": "0", "MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3": "0",
"MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1": "0", "MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1": "0",
# PERF_VALS intentionally blank: MIOpen reads this as a config string not a boolean; # PERF_VALS intentionally blank: MIOpen reads this as a config string not a boolean;
@ -110,30 +110,30 @@ RDNA2: Dict[str, str] = {
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS": "", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS": "",
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_SEARCH_OPTIMIZED": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_SEARCH_OPTIMIZED": "1",
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR": "1",
# NAIVE_CONV_FWD: scalar FP32 reference solver IsApplicable does NOT reliably filter for FP16; # NAIVE_CONV_FWD: scalar FP32 reference solver - IsApplicable does NOT reliably filter for FP16;
# can be selected for unusual shapes (e.g. VAE decoder 3-ch output) and returns dtype=float32 # can be selected for unusual shapes (e.g. VAE decoder 3-ch output) and returns dtype=float32
"MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD": "0", "MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD": "0",
# Direct OCL dtype notes # Direct OCL - dtype notes
# FWD / FWD1X1: FP32/FP16 forward enabled # FWD / FWD1X1: FP32/FP16 forward - enabled
"MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD": "1", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD": "1",
"MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1": "1", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1": "1",
# FWD11X11: requires 11*11 kernel — no SD match — disabled # FWD11X11: requires 11*11 kernel - no SD match - disabled
"MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11": "0", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11": "0",
# FWDGEN: FP32 generic OCL fallback IsApplicable does NOT reliably reject for FP16; # FWDGEN: FP32 generic OCL fallback - IsApplicable does NOT reliably reject for FP16;
# can produce dtype=float32 output for FP16 inputs disabled # can produce dtype=float32 output for FP16 inputs - disabled
"MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN": "0", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN": "0",
# WRW2 / WRW53 / WRW1X1: training-only weight-gradient disabled # WRW2 / WRW53 / WRW1X1: training-only weight-gradient - disabled
"MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2": "0", "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2": "0",
"MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53": "0", "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53": "0",
"MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1": "0", "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1": "0",
# Winograd RxS dtype per MIOpen docs # Winograd RxS - dtype per MIOpen docs
# WINOGRAD_3X3: FP32-only harmless (IsApplicable rejects for fp16); enabled # WINOGRAD_3X3: FP32-only - harmless (IsApplicable rejects for fp16); enabled
"MIOPEN_DEBUG_AMD_WINOGRAD_3X3": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_3X3": "1",
# RXS: covers FP32/FP16 F(3,3) Fwd/Bwd + FP32 F(3,2) WrW keep enabled (fp16 fwd/bwd path exists) # RXS: covers FP32/FP16 F(3,3) Fwd/Bwd + FP32 F(3,2) WrW - keep enabled (fp16 fwd/bwd path exists)
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS": "1",
# RXS_FWD_BWD: FP32/FP16 explicitly the fp16-capable subset # RXS_FWD_BWD: FP32/FP16 - explicitly the fp16-capable subset
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD": "1",
# RXS_WRW: FP32 WrW only training-only, disabled for inference fp16 profile # RXS_WRW: FP32 WrW only - training-only, disabled for inference fp16 profile
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS_WRW": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_WRW": "0",
# RXS_F3X2: FP32/FP16 Fwd/Bwd # RXS_F3X2: FP32/FP16 Fwd/Bwd
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2": "1",
@ -141,15 +141,15 @@ RDNA2: Dict[str, str] = {
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3": "1",
# RXS_F2X3_G1: FP32/FP16 Fwd/Bwd (non-group convolutions) # RXS_F2X3_G1: FP32/FP16 Fwd/Bwd (non-group convolutions)
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1": "1",
# FUSED_WINOGRAD: FP32-only harmless (IsApplicable rejects for fp16); enabled # FUSED_WINOGRAD: FP32-only - harmless (IsApplicable rejects for fp16); enabled
"MIOPEN_DEBUG_AMD_FUSED_WINOGRAD": "1", "MIOPEN_DEBUG_AMD_FUSED_WINOGRAD": "1",
# PERF_VALS intentionally blank: same reason as ASM_1X1U not a boolean, config string # PERF_VALS intentionally blank: same reason as ASM_1X1U - not a boolean, config string
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_PERF_VALS": "", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_PERF_VALS": "",
# Fury/Rage Winograd NOT available on RDNA2 # Fury/Rage Winograd - NOT available on RDNA2
"MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "0",
"MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "0",
"MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "0",
# MPASS only F3x2 and F3x3 are safe on RDNA2 # MPASS - only F3x2 and F3x3 are safe on RDNA2
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2": "1",
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3": "1",
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "0",
@ -159,50 +159,50 @@ RDNA2: Dict[str, str] = {
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4": "0",
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2": "0",
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3": "0",
# ASM Implicit GEMM forward V4R1 only; no GTC/XDLOPS on RDNA2 # ASM Implicit GEMM - forward V4R1 only; no GTC/XDLOPS on RDNA2
# BWD (backward data-gradient) and WrW (weight-gradient) are training-only disabled # BWD (backward data-gradient) and WrW (weight-gradient) are training-only - disabled
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1": "1",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1": "1",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_V4R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_V4R1": "0",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_V4R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_V4R1": "0",
# HIP Implicit GEMM non-XDLOPS V4R1/R4 forward only # HIP Implicit GEMM - non-XDLOPS V4R1/R4 forward only
# BWD (backward data-gradient) and WrW (weight-gradient) are training-only disabled # BWD (backward data-gradient) and WrW (weight-gradient) are training-only - disabled
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1": "1",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4": "1",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1": "0",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1": "0",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1": "0",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4": "0",
# Group Conv XDLOPS / CK default kernels RDNA3/4 only, not available on RDNA2 # Group Conv XDLOPS / CK default kernels - RDNA3/4 only, not available on RDNA2
"MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "0", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "0",
"MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "0", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "0",
"MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "0", "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "0",
} }
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# RDNA3 gfx1100 (RX 7000 series) # RDNA3 - gfx1100 (RX 7000 series)
# Fury Winograd added; MPASS F3x4 enabled; Group Conv XDLOPS + CK default kernels enabled # Fury Winograd added; MPASS F3x4 enabled; Group Conv XDLOPS + CK default kernels enabled
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
RDNA3: Dict[str, str] = { RDNA3: Dict[str, str] = {
**RDNA2, **RDNA2,
# Fury Winograd introduced for gfx1100 (RDNA3) # Fury Winograd - introduced for gfx1100 (RDNA3)
"MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "1",
"MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "1",
# Wider MPASS on RDNA3 # Wider MPASS on RDNA3
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "1",
# Group Conv XDLOPS / CK available from gfx1100 (RDNA3) onwards # Group Conv XDLOPS / CK - available from gfx1100 (RDNA3) onwards
"MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "1", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "1",
"MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "1", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "1",
"MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "1", "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "1",
} }
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# RDNA4 gfx1200 (RX 9000 series) # RDNA4 - gfx1200 (RX 9000 series)
# Rage Winograd added; MPASS F3x5 enabled # Rage Winograd added; MPASS F3x5 enabled
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
RDNA4: Dict[str, str] = { RDNA4: Dict[str, str] = {
**RDNA3, **RDNA3,
# Rage Winograd introduced for gfx1200 (RDNA4) # Rage Winograd - introduced for gfx1200 (RDNA4)
"MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "1",
# Wider MPASS on RDNA4 # Wider MPASS on RDNA4
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5": "1",

View File

@ -1,39 +1,51 @@
from typing import Dict, Any, List, Tuple from typing import Dict, Any, List, Tuple
# --- General MIOpen/rocBLAS variables (dropdown/textbox/checkbox) --- # --- General MIOpen/rocBLAS variables (dropdown/textbox/checkbox) ---
GENERAL_VARS: Dict[str, Dict[str, Any]] = { GENERAL_VARS: Dict[str, Dict[str, Any]] = {
"MIOPEN_SYSTEM_DB_PATH": {
# ── GEMM backend selector + companion toggles ────────────────────────── "default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\",
"desc": "MIOpen system DB path",
"widget": "textbox",
"options": None,
"restart_required": True,
},
"ROCBLAS_TENSILE_LIBPATH": {
"default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\rocblas\\library",
"desc": "rocBLAS Tensile library path",
"widget": "textbox",
"options": None,
"restart_required": True,
},
# -- GEMM backend selector + companion toggles --------------------------
"MIOPEN_GEMM_ENFORCE_BACKEND": { "MIOPEN_GEMM_ENFORCE_BACKEND": {
"default": "1", "default": "1",
"desc": "Enforce GEMM backend", "desc": "GEMM backend",
"widget": "dropdown", "widget": "dropdown",
"options": [("1 - rocBLAS", "1"), ("5 - hipBLASLt", "5")], "options": [("1 - rocBLAS", "1"), ("5 - hipBLASLt", "5")],
"restart_required": False, "restart_required": False,
}, },
"PYTORCH_ROCM_USE_ROCBLAS": { "PYTORCH_ROCM_USE_ROCBLAS": {
"default": "0", "default": "0",
"desc": "PyTorch ROCm: prioritise rocBLAS for linear algebra", "desc": "PyTorch: Use rocBLAS.",
"widget": "dropdown", "widget": "dropdown",
"options": [("0 - Off", "0"), ("1 - On", "1")], "options": [("0 - Off", "0"), ("1 - On", "1")],
"restart_required": True, "restart_required": True,
}, },
"PYTORCH_HIPBLASLT_DISABLE": { "PYTORCH_HIPBLASLT_DISABLE": {
"default": "1", "default": "1",
"desc": "Disable PyTorch hipBLASLt dispatcher", "desc": "PyTorch: Use hipBLASLt.",
"widget": "dropdown", "widget": "dropdown",
"options": [("0 - Allow hipBLASLt", "0"), ("1 - Disable hipBLASLt", "1")], "options": [("0 - Allow hipBLASLt", "0"), ("1 - Disable hipBLASLt", "1")],
"restart_required": True, "restart_required": True,
}, },
"ROCBLAS_USE_HIPBLASLT": { "ROCBLAS_USE_HIPBLASLT": {
"default": "0", "default": "0",
"desc": "rocBLAS: use hipBLASLt backend (0 = Tensile)", "desc": "rocBLAS: use hipBLASLt backend.",
"widget": "dropdown", "widget": "dropdown",
"options": [("0 - Tensile (rocBLAS)", "0"), ("1 - hipBLASLt", "1")], "options": [("0 - Tensile (rocBLAS)", "0"), ("1 - hipBLASLt", "1")],
"restart_required": True, "restart_required": True,
}, },
# -- MIOpen behavioural settings ----------------------------------------
# ── MIOpen behavioural settings ────────────────────────────────────────
"MIOPEN_FIND_MODE": { "MIOPEN_FIND_MODE": {
"default": "2", "default": "2",
"desc": "MIOpen Find Mode", "desc": "MIOpen Find Mode",
@ -57,34 +69,21 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = {
}, },
"MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC": { "MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC": {
"default": "0", "default": "0",
"desc": "Deterministic convolution (reproducible results, may be slower)", "desc": "Deterministic convolutions",
"widget": "dropdown", "widget": "dropdown",
"options": [("0 - Off", "0"), ("1 - On", "1")], "options": [("0 - Off", "0"), ("1 - On", "1")],
"restart_required": False, "restart_required": False,
}, },
# ── Paths / sizes ────────────────────────────────────────────────────── # -- Paths / sizes ------------------------------------------------------
"MIOPEN_SYSTEM_DB_PATH": {
"default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\",
"desc": "MIOpen system DB path",
"widget": "textbox",
"options": None,
"restart_required": True,
},
"MIOPEN_CONVOLUTION_MAX_WORKSPACE": { "MIOPEN_CONVOLUTION_MAX_WORKSPACE": {
"default": "1073741824", "default": "1073741824",
"desc": "MIOpen convolution max workspace (bytes; 1 GB default)", "desc": "MIOpen convolutions: max workspace (bytes; 1 GB)",
"widget": "textbox", "widget": "textbox",
"options": None, "options": None,
"restart_required": False, "restart_required": False,
}, },
"ROCBLAS_TENSILE_LIBPATH": {
"default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\rocblas\\library",
"desc": "rocBLAS Tensile library path",
"widget": "textbox",
"options": None,
"restart_required": True,
},
"ROCBLAS_DEVICE_MEMORY_SIZE": { "ROCBLAS_DEVICE_MEMORY_SIZE": {
"default": "", "default": "",
"desc": "rocBLAS workspace size in bytes (empty = dynamic)", "desc": "rocBLAS workspace size in bytes (empty = dynamic)",
@ -94,13 +93,13 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = {
}, },
"PYTORCH_TUNABLEOP_CACHE_DIR": { "PYTORCH_TUNABLEOP_CACHE_DIR": {
"default": "{ROOT}\\models\\tunable", "default": "{ROOT}\\models\\tunable",
"desc": "TunableOp: kernel profile cache directory", "desc": "TunableOp cache directory",
"widget": "textbox", "widget": "textbox",
"options": None, "options": None,
"restart_required": False, "restart_required": False,
}, },
# ── rocBLAS settings ─────────────────────────────────────────────────── # -- rocBLAS settings ---------------------------------------------------
"ROCBLAS_STREAM_ORDER_ALLOC": { "ROCBLAS_STREAM_ORDER_ALLOC": {
"default": "1", "default": "1",
"desc": "rocBLAS stream-ordered memory allocation", "desc": "rocBLAS stream-ordered memory allocation",
@ -110,27 +109,27 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = {
}, },
"ROCBLAS_DEFAULT_ATOMICS_MODE": { "ROCBLAS_DEFAULT_ATOMICS_MODE": {
"default": "1", "default": "1",
"desc": "rocBLAS default atomics mode (1 = allow non-deterministic for performance)", "desc": "rocBLAS allow atomics",
"widget": "dropdown", "widget": "dropdown",
"options": [("0 - Off (deterministic)", "0"), ("1 - On (performance)", "1")], "options": [("0 - Off (deterministic)", "0"), ("1 - On (performance)", "1")],
"restart_required": False, "restart_required": False,
}, },
"PYTORCH_TUNABLEOP_ROCBLAS_ENABLED": { "PYTORCH_TUNABLEOP_ROCBLAS_ENABLED": {
"default": "0", "default": "0",
"desc": "TunableOp: wrap and optimise rocBLAS GEMM calls", "desc": "TunableOp: Enable tuning",
"widget": "dropdown", "widget": "dropdown",
"options": [("0 - Off", "0"), ("1 - On", "1")], "options": [("0 - Off", "0"), ("1 - On", "1")],
"restart_required": False, "restart_required": False,
}, },
"PYTORCH_TUNABLEOP_TUNING": { "PYTORCH_TUNABLEOP_TUNING": {
"default": "0", "default": "0",
"desc": "TunableOp: tuning mode (1 = benchmark; 0 = use saved CSV)", "desc": "TunableOp: Tuning mode",
"widget": "dropdown", "widget": "dropdown",
"options": [("0 - Use saved CSV", "0"), ("1 - Benchmark new shapes", "1")], "options": [("0 - Use Cache", "0"), ("1 - Benchmark new shapes", "1")],
"restart_required": False, "restart_required": False,
}, },
# ── hipBLASLt settings ───────────────────────────────────────────────── # -- hipBLASLt settings -------------------------------------------------
"PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED": { "PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED": {
"default": "0", "default": "0",
"desc": "TunableOp: benchmark hipBLASLt kernels", "desc": "TunableOp: benchmark hipBLASLt kernels",
@ -139,7 +138,7 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = {
"restart_required": False, "restart_required": False,
}, },
# ── Logging: MIOpen → rocBLAS → hipBLASLt ───────────────────────────── # -- Logging: MIOpen -> rocBLAS -> hipBLASLt -----------------------------
"MIOPEN_LOG_LEVEL": { "MIOPEN_LOG_LEVEL": {
"default": "0", "default": "0",
"desc": "MIOpen log verbosity level", "desc": "MIOpen log verbosity level",
@ -171,13 +170,13 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = {
} }
# --- Solver toggles (inference/FWD only, RDNA2/3/4 compatible) --- # --- Solver toggles (inference/FWD only, RDNA2/3/4 compatible) ---
# Removed entirely — not representable in the UI, cannot be set by users: # Removed entirely - not representable in the UI, cannot be set by users:
# WRW (weight-gradient) and BWD (data-gradient) — training passes only, never run during inference # WRW (weight-gradient) and BWD (data-gradient) - training passes only, never run during inference
# XDLOPS/CK CDNA-exclusive (MI100/MI200/MI300 matrix engine variants) — not on any RDNA # XDLOPS/CK CDNA-exclusive (MI100/MI200/MI300 matrix engine variants) - not on any RDNA
# Fixed-geometry (5x10, 7x7-ImageNet, 11x11) — shapes never appear in SD/video inference # Fixed-geometry (5x10, 7x7-ImageNet, 11x11) - shapes never appear in SD/video inference
# FP32-reference (NAIVE_CONV_FWD, FWDGEN) — IsApplicable() unreliable for FP16/BF16 # FP32-reference (NAIVE_CONV_FWD, FWDGEN) - IsApplicable() unreliable for FP16/BF16
# Wide MPASS (F3x4..F7x3) — kernel sizes that cannot match any SD convolution shape # Wide MPASS (F3x4..F7x3) - kernel sizes that cannot match any SD convolution shape
# Disabled by default (added but off): RDNA3/4-only — Group Conv XDLOPS, CK default kernels # Disabled by default (added but off): RDNA3/4-only - Group Conv XDLOPS, CK default kernels
_SOLVER_DESCS: Dict[str, str] = {} _SOLVER_DESCS: Dict[str, str] = {}
_SOLVER_DESCS.update({ _SOLVER_DESCS.update({
@ -200,7 +199,7 @@ _SOLVER_DESCS.update({
"MIOPEN_DEBUG_ATTN_SOFTMAX": "Enable Attention Softmax", "MIOPEN_DEBUG_ATTN_SOFTMAX": "Enable Attention Softmax",
}) })
_SOLVER_DESCS.update({ _SOLVER_DESCS.update({
# Direct ASM — FWD inference only (WRW, fixed-geometry, FP32-reference removed) # Direct ASM - FWD inference only (WRW, fixed-geometry, FP32-reference removed)
"MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U": "Enable Direct ASM 3x3U", "MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U": "Enable Direct ASM 3x3U",
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U": "Enable Direct ASM 1x1U", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U": "Enable Direct ASM 1x1U",
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2": "Enable Direct ASM 1x1UV2", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2": "Enable Direct ASM 1x1UV2",
@ -208,12 +207,12 @@ _SOLVER_DESCS.update({
"MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR": "Enable Direct ASM 1x1U AI Heuristic", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR": "Enable Direct ASM 1x1U AI Heuristic",
}) })
_SOLVER_DESCS.update({ _SOLVER_DESCS.update({
# Direct OCL — FWD inference only (WRW, FWD11X11 fixed-geom, FWDGEN FP32-ref removed) # Direct OCL - FWD inference only (WRW, FWD11X11 fixed-geom, FWDGEN FP32-ref removed)
"MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD": "Enable Direct OCL FWD", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD": "Enable Direct OCL FWD",
"MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1": "Enable Direct OCL FWD1X1", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1": "Enable Direct OCL FWD1X1",
}) })
_SOLVER_DESCS.update({ _SOLVER_DESCS.update({
# Winograd FWD — WRW removed; Fury/Rage kept as RDNA3/4 inference (off by default) # Winograd FWD - WRW removed; Fury/Rage kept as RDNA3/4 inference (off by default)
"MIOPEN_DEBUG_AMD_WINOGRAD_3X3": "Enable AMD Winograd 3x3", "MIOPEN_DEBUG_AMD_WINOGRAD_3X3": "Enable AMD Winograd 3x3",
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS": "Enable AMD Winograd RxS", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS": "Enable AMD Winograd RxS",
"MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD": "Enable AMD Winograd RxS FWD", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD": "Enable AMD Winograd RxS FWD",
@ -226,32 +225,32 @@ _SOLVER_DESCS.update({
"MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "Enable AMD Winograd Rage RxS F2x3", "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "Enable AMD Winograd Rage RxS F2x3",
}) })
_SOLVER_DESCS.update({ _SOLVER_DESCS.update({
# Multi-pass Winograd — only F3x2/F3x3 match typical 3x3 SD shapes; wider kernels removed # Multi-pass Winograd - only F3x2/F3x3 match typical 3x3 SD shapes; wider kernels removed
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2": "Enable AMD Winograd MPASS F3x2", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2": "Enable AMD Winograd MPASS F3x2",
"MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3": "Enable AMD Winograd MPASS F3x3", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3": "Enable AMD Winograd MPASS F3x3",
}) })
_SOLVER_DESCS.update({ _SOLVER_DESCS.update({
# Implicit GEMM FWD — BWD/WRW (training), CDNA-exclusive XDLOPS variants removed # Implicit GEMM FWD - BWD/WRW (training), CDNA-exclusive XDLOPS variants removed
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1": "Enable ASM Implicit GEMM FWD V4R1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1": "Enable ASM Implicit GEMM FWD V4R1",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1": "Enable ASM Implicit GEMM FWD V4R1 1x1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1": "Enable ASM Implicit GEMM FWD V4R1 1x1",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1": "Enable HIP Implicit GEMM FWD V4R1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1": "Enable HIP Implicit GEMM FWD V4R1",
"MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4": "Enable HIP Implicit GEMM FWD V4R4", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4": "Enable HIP Implicit GEMM FWD V4R4",
}) })
_SOLVER_DESCS.update({ _SOLVER_DESCS.update({
# Group Conv XDLOPS FWD — RDNA3/4 (gfx1100+) only; disabled by default # Group Conv XDLOPS FWD - RDNA3/4 (gfx1100+) only; disabled by default
"MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "Enable Group Conv Implicit GEMM XDLOPS FWD", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "Enable Group Conv Implicit GEMM XDLOPS FWD",
"MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "Enable Group Conv Implicit GEMM XDLOPS FWD AI Heuristic", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "Enable Group Conv Implicit GEMM XDLOPS FWD AI Heuristic",
# CK (Composable Kernel) default kernels — RDNA3/4 (gfx1100+); disabled by default # CK (Composable Kernel) default kernels - RDNA3/4 (gfx1100+); disabled by default
"MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "Enable CK (Composable Kernel) default kernels", "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "Enable CK (Composable Kernel) default kernels",
}) })
# Solvers still in the registry but disabled by default. # Solvers still in the registry but disabled by default.
# FORCE_IMMED_MODE_FALLBACK — overrides FIND_MODE entirely, defeats tuning DB # FORCE_IMMED_MODE_FALLBACK - overrides FIND_MODE entirely, defeats tuning DB
# Fury RxS F2x3/F3x2 — RDNA3/4-only; harmless on RDNA2 but won't select # Fury RxS F2x3/F3x2 - RDNA3/4-only; harmless on RDNA2 but won't select
# Rage RxS F2x3 — RDNA4-only # Rage RxS F2x3 - RDNA4-only
# Group Conv XDLOPS — RDNA3/4-only (gfx1100+) # Group Conv XDLOPS - RDNA3/4-only (gfx1100+)
# CK_DEFAULT_KERNELS — RDNA3/4-only (gfx1100+) # CK_DEFAULT_KERNELS - RDNA3/4-only (gfx1100+)
SOLVER_DISABLED_BY_DEFAULT = { SOLVER_DISABLED_BY_DEFAULT = {
"MIOPEN_DEBUG_FORCE_IMMED_MODE_FALLBACK", "MIOPEN_DEBUG_FORCE_IMMED_MODE_FALLBACK",
"MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3",