diff --git a/scripts/rocm/rocm_mgr.py b/scripts/rocm/rocm_mgr.py index ccfb90de9..ea987a421 100644 --- a/scripts/rocm/rocm_mgr.py +++ b/scripts/rocm/rocm_mgr.py @@ -29,17 +29,17 @@ CONFIG = Path(os.path.abspath(os.path.join('data', 'rocm.json'))) _cache: Optional[Dict[str, str]] = None # loaded once, invalidated on save # Metadata key written into rocm.json to record which architecture profile is active. -# Not an environment variable — always skipped during env application but preserved in the +# Not an environment variable - always skipped during env application but preserved in the # saved config so that arch-safety enforcement is consistent across restarts. _ARCH_KEY = "_rocm_arch" # Vars that must never appear in the process environment. # -# _DTYPE_UNSAFE: alter FP16 inference dtype — must be cleared regardless of config -# MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL — DEBUG alias: routes all FP16 convs through BF16 exponent math -# MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL — API-level alias: same BF16-exponent effect -# MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM — unstable experimental FP16 path -# MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16 — changes FP16 WrW atomic accumulation +# _DTYPE_UNSAFE: alter FP16 inference dtype - must be cleared regardless of config +# MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL - DEBUG alias: routes all FP16 convs through BF16 exponent math +# MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL - API-level alias: same BF16-exponent effect +# MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM - unstable experimental FP16 path +# MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16 - changes FP16 WrW atomic accumulation # # SOLVER_DISABLED_BY_DEFAULT: every solver known to be incompatible with this runtime # (FP32-only, training-only WrW/BWD, fixed-geometry mismatches, XDLOPS/CDNA-only, arch-specific). @@ -54,18 +54,18 @@ _DTYPE_UNSAFE = { # regardless of saved config. Limited to dtype-corrupting vars only. # IMPORTANT: SOLVER_DISABLED_BY_DEFAULT is intentionally NOT included here. # When a solver var is absent (unset) MIOpen still calls IsApplicable() on every -# conv-find — wasted probing overhead. When a var is explicitly "0" MIOpen skips +# conv-find - wasted probing overhead. When a var is explicitly "0" MIOpen skips # IsApplicable() immediately. Solver defaults flow through the config loop as "0" # (their ROCM_ENV_VARS default is "0") so they are explicitly set to "0" in the env. _UNSET_VARS = _DTYPE_UNSAFE # Additional environment vars that must be removed from the process before MIOpen loads. # These are not MIOpen solver toggles but can corrupt MIOpen's runtime behaviour: -# HIP_PATH / HIP_PATH_71 — point to the system AMD ROCm install; override the venv-bundled +# HIP_PATH / HIP_PATH_71 - point to the system AMD ROCm install; override the venv-bundled # _rocm_sdk_devel DLLs with a potentially mismatched system version -# QML_*/QT_* — QtQuick shader/disk-cache flags leaked from Qt tools; harmless for +# QML_*/QT_* - QtQuick shader/disk-cache flags leaked from Qt tools; harmless for # PyTorch but can conflict with Gradio's embedded Qt helpers -# PYENV_VIRTUALENV_DISABLE_PROMPT — pyenv noise that confuses venv detection +# PYENV_VIRTUALENV_DISABLE_PROMPT - pyenv noise that confuses venv detection _EXTRA_CLEAR_VARS = { "HIP_PATH", "HIP_PATH_71", @@ -73,7 +73,7 @@ _EXTRA_CLEAR_VARS = { "QML_DISABLE_DISK_CACHE", "QML_FORCE_DISK_CACHE", "QT_DISABLE_SHADER_DISK_CACHE", - # PERF_VALS vars are NOT boolean toggles — MIOpen reads them as perf-config strings. + # PERF_VALS vars are NOT boolean toggles - MIOpen reads them as perf-config strings. # If inherited from a parent shell with value "1", MIOpen's GetPerfConfFromEnv parses # "1" as a degenerate config and can return dtype=float32 output from FP16 tensors. "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS", @@ -82,12 +82,12 @@ _EXTRA_CLEAR_VARS = { # Solvers whose MIOpen IsApplicable() explicitly rejects non-FP32 tensors. # They are safe to leave enabled in FP32 mode. When the active dtype is FP16 or BF16 -# we force them OFF so MIOpen skips the IsApplicable probe entirely — avoids overhead on +# we force them OFF so MIOpen skips the IsApplicable probe entirely - avoids overhead on # every conv shape find. These are NOT in _UNSET_VARS because they are valid in FP32. _FP32_ONLY_SOLVERS = { - "MIOPEN_DEBUG_CONV_FFT", # FFT convolution — FP32 only (MIOpen source: IsFp32 check) - "MIOPEN_DEBUG_AMD_WINOGRAD_3X3", # Winograd 3x3 — FP32 only - "MIOPEN_DEBUG_AMD_FUSED_WINOGRAD", # Fused Winograd — FP32 only + "MIOPEN_DEBUG_CONV_FFT", # FFT convolution - FP32 only (MIOpen source: IsFp32 check) + "MIOPEN_DEBUG_AMD_WINOGRAD_3X3", # Winograd 3x3 - FP32 only + "MIOPEN_DEBUG_AMD_FUSED_WINOGRAD", # Fused Winograd - FP32 only } @@ -172,7 +172,7 @@ def load_config() -> Dict[str, str]: _cache = data if data else {k: v["default"] for k, v in ROCM_ENV_VARS.items()} # Purge unsafe vars from a stale saved config and re-persist only if the file existed. # When running without a saved config (first run / after Delete), load_config() must - # never create the file — that only happens via save_config() on Apply or Apply Profile. + # never create the file - that only happens via save_config() on Apply or Apply Profile. dirty = {k for k in _cache if k in _UNSET_VARS or (k != _ARCH_KEY and k not in ROCM_ENV_VARS)} if dirty: _cache = {k: v for k, v in _cache.items() if k not in dirty} @@ -221,7 +221,7 @@ def apply_env(config: Optional[Dict[str, str]] = None) -> None: os.environ[var] = expanded # Arch safety net: hard-force all hardware-incompatible vars to "0" in the env. # This runs *after* the config loop so it overrides any stale "1" that survived in the JSON. - # Source of truth: rocm_profiles.UNAVAILABLE[arch] — vars with no supporting hardware. + # Source of truth: rocm_profiles.UNAVAILABLE[arch] - vars with no supporting hardware. arch = config.get(_ARCH_KEY, "") unavailable = rocm_profiles.UNAVAILABLE.get(arch, set()) if unavailable: @@ -249,7 +249,7 @@ def apply_all(names: list, values: list) -> None: meta = ROCM_ENV_VARS[name] if meta["widget"] == "checkbox": if value is None: - pass # Gradio passed None (component not interacted with) — leave config unchanged + pass # Gradio passed None (component not interacted with) - leave config unchanged else: config[name] = "1" if value else "0" elif meta["widget"] == "radio": @@ -257,7 +257,7 @@ def apply_all(names: list, values: list) -> None: valid = {v for _, v in meta["options"]} if meta["options"] and isinstance(meta["options"][0], tuple) else set(meta["options"] or []) if stored in valid: config[name] = stored - # else: value was None/invalid — leave the existing saved value untouched + # else: value was None/invalid - leave the existing saved value untouched else: if meta.get("options"): value = _dropdown_stored(str(value), meta["options"]) @@ -300,7 +300,7 @@ def delete_config() -> None: CONFIG.unlink() log.info(f'ROCm delete_config: deleted {CONFIG}') _cache = None - # Delete the MIOpen user DB (~/.miopen/db) — stale entries can cause solver mismatches + # Delete the MIOpen user DB (~/.miopen/db) - stale entries can cause solver mismatches miopen_db = Path(os.path.expanduser('~')) / '.miopen' / 'db' if miopen_db.exists(): shutil.rmtree(miopen_db, ignore_errors=True) @@ -458,7 +458,7 @@ def info() -> dict: if ufiles: udb["files"] = ufiles - # --- User cache (~/.miopen/cache/) --- + # User cache (~/.miopen/cache/) cache_base = Path.home() / ".miopen" / "cache" db_hash = _extract_db_hash(user_db_path) if user_db_path.exists() else "" cache_path = cache_base / db_hash if db_hash else cache_base diff --git a/scripts/rocm/rocm_profiles.py b/scripts/rocm/rocm_profiles.py index eeba0628e..62a401de8 100644 --- a/scripts/rocm/rocm_profiles.py +++ b/scripts/rocm/rocm_profiles.py @@ -1,4 +1,4 @@ -""" +""" Architecture-specific MIOpen solver profiles for AMD GCN/RDNA GPUs. Sources: @@ -6,8 +6,8 @@ Sources: Key axis: consumer RDNA GPUs have NO XDLOPS hardware (that's CDNA/Instinct only). RDNA2 (gfx1030): RX 6000 series - RDNA3 (gfx1100): RX 7000 series — adds Fury Winograd, wider MPASS - RDNA4 (gfx1200): RX 9000 series — adds Rage Winograd, wider MPASS + RDNA3 (gfx1100): RX 7000 series - adds Fury Winograd, wider MPASS + RDNA4 (gfx1200): RX 9000 series - adds Rage Winograd, wider MPASS Each profile is a dict of {var: value} that will be MERGED on top of the current config (general vars like DB path / log level are preserved). @@ -15,9 +15,9 @@ current config (general vars like DB path / log level are preserved). from typing import Dict -# --------------------------------------------------------------------------- + # Shared: everything that must be OFF on ALL consumer RDNA (no XDLOPS hw) -# --------------------------------------------------------------------------- + _XDLOPS_OFF: Dict[str, str] = { # GTC XDLOPS (CDNA-only) "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS": "0", @@ -55,7 +55,7 @@ _XDLOPS_OFF: Dict[str, str] = { # MLIR (CDNA-only in practice) "MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW_XDLOPS": "0", "MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS": "0", - # MP BD Winograd (Multi-pass Block-Decomposed — CDNA / high-end only) + # MP BD Winograd (Multi-pass Block-Decomposed - CDNA / high-end only) "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F2X3": "0", "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F3X3": "0", "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F4X3": "0", @@ -68,17 +68,17 @@ _XDLOPS_OFF: Dict[str, str] = { "MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F6X3": "0", } -# --------------------------------------------------------------------------- -# RDNA2 — gfx1030 (RX 6000 series) + +# RDNA2 - gfx1030 (RX 6000 series) # No XDLOPS, no Fury/Rage Winograd, MPASS limited to F3x2/F3x3 # ASM IGEMM: V4R1 variants only; HIP IGEMM: non-XDLOPS V4R1/R4 only -# --------------------------------------------------------------------------- + RDNA2: Dict[str, str] = { **_XDLOPS_OFF, # General settings (architecture-independent; set here so all profiles cover them) "MIOPEN_SEARCH_CUTOFF": "0", "MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC": "0", - # Core algo enables — FFT is FP32-only but harmless (IsApplicable rejects it for fp16 tensors) + # Core algo enables - FFT is FP32-only but harmless (IsApplicable rejects it for fp16 tensors) "MIOPEN_DEBUG_CONV_FFT": "1", "MIOPEN_DEBUG_CONV_DIRECT": "1", "MIOPEN_DEBUG_CONV_GEMM": "1", @@ -93,16 +93,16 @@ RDNA2: Dict[str, str] = { "MIOPEN_DEBUG_OPENCL_CONVOLUTIONS": "1", "MIOPEN_DEBUG_OPENCL_WAVE64_NOWGP": "1", "MIOPEN_DEBUG_ATTN_SOFTMAX": "1", - # Direct ASM — dtype notes - # 3X3U / 1X1U / 1X1UV2: FP32/FP16 forward — enabled + # Direct ASM - dtype notes + # 3X3U / 1X1U / 1X1UV2: FP32/FP16 forward - enabled "MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2": "1", - # 5X10U2V2: fixed geometry (5*10 stride-2), no SD conv matches — disabled + # 5X10U2V2: fixed geometry (5*10 stride-2), no SD conv matches - disabled "MIOPEN_DEBUG_CONV_DIRECT_ASM_5X10U2V2": "0", - # 7X7C3H224W224: hard-coded ImageNet stem (C=3, H=W=224, K=64) — never matches SD — disabled + # 7X7C3H224W224: hard-coded ImageNet stem (C=3, H=W=224, K=64) - never matches SD - disabled "MIOPEN_DEBUG_CONV_DIRECT_ASM_7X7C3H224W224": "0", - # WRW3X3 / WRW1X1: FP32-only weight-gradient (training only) — disabled for inference + # WRW3X3 / WRW1X1: FP32-only weight-gradient (training only) - disabled for inference "MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3": "0", "MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1": "0", # PERF_VALS intentionally blank: MIOpen reads this as a config string not a boolean; @@ -110,30 +110,30 @@ RDNA2: Dict[str, str] = { "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS": "", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_SEARCH_OPTIMIZED": "1", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR": "1", - # NAIVE_CONV_FWD: scalar FP32 reference solver — IsApplicable does NOT reliably filter for FP16; + # NAIVE_CONV_FWD: scalar FP32 reference solver - IsApplicable does NOT reliably filter for FP16; # can be selected for unusual shapes (e.g. VAE decoder 3-ch output) and returns dtype=float32 "MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD": "0", - # Direct OCL — dtype notes - # FWD / FWD1X1: FP32/FP16 forward — enabled + # Direct OCL - dtype notes + # FWD / FWD1X1: FP32/FP16 forward - enabled "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD": "1", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1": "1", - # FWD11X11: requires 11*11 kernel — no SD match — disabled + # FWD11X11: requires 11*11 kernel - no SD match - disabled "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11": "0", - # FWDGEN: FP32 generic OCL fallback — IsApplicable does NOT reliably reject for FP16; - # can produce dtype=float32 output for FP16 inputs — disabled + # FWDGEN: FP32 generic OCL fallback - IsApplicable does NOT reliably reject for FP16; + # can produce dtype=float32 output for FP16 inputs - disabled "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN": "0", - # WRW2 / WRW53 / WRW1X1: training-only weight-gradient — disabled + # WRW2 / WRW53 / WRW1X1: training-only weight-gradient - disabled "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2": "0", "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53": "0", "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1": "0", - # Winograd RxS — dtype per MIOpen docs - # WINOGRAD_3X3: FP32-only — harmless (IsApplicable rejects for fp16); enabled + # Winograd RxS - dtype per MIOpen docs + # WINOGRAD_3X3: FP32-only - harmless (IsApplicable rejects for fp16); enabled "MIOPEN_DEBUG_AMD_WINOGRAD_3X3": "1", - # RXS: covers FP32/FP16 F(3,3) Fwd/Bwd + FP32 F(3,2) WrW — keep enabled (fp16 fwd/bwd path exists) + # RXS: covers FP32/FP16 F(3,3) Fwd/Bwd + FP32 F(3,2) WrW - keep enabled (fp16 fwd/bwd path exists) "MIOPEN_DEBUG_AMD_WINOGRAD_RXS": "1", - # RXS_FWD_BWD: FP32/FP16 — explicitly the fp16-capable subset + # RXS_FWD_BWD: FP32/FP16 - explicitly the fp16-capable subset "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD": "1", - # RXS_WRW: FP32 WrW only — training-only, disabled for inference fp16 profile + # RXS_WRW: FP32 WrW only - training-only, disabled for inference fp16 profile "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_WRW": "0", # RXS_F3X2: FP32/FP16 Fwd/Bwd "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2": "1", @@ -141,15 +141,15 @@ RDNA2: Dict[str, str] = { "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3": "1", # RXS_F2X3_G1: FP32/FP16 Fwd/Bwd (non-group convolutions) "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1": "1", - # FUSED_WINOGRAD: FP32-only — harmless (IsApplicable rejects for fp16); enabled + # FUSED_WINOGRAD: FP32-only - harmless (IsApplicable rejects for fp16); enabled "MIOPEN_DEBUG_AMD_FUSED_WINOGRAD": "1", - # PERF_VALS intentionally blank: same reason as ASM_1X1U — not a boolean, config string + # PERF_VALS intentionally blank: same reason as ASM_1X1U - not a boolean, config string "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_PERF_VALS": "", - # Fury/Rage Winograd — NOT available on RDNA2 + # Fury/Rage Winograd - NOT available on RDNA2 "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "0", - # MPASS — only F3x2 and F3x3 are safe on RDNA2 + # MPASS - only F3x2 and F3x3 are safe on RDNA2 "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "0", @@ -159,50 +159,50 @@ RDNA2: Dict[str, str] = { "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2": "0", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3": "0", - # ASM Implicit GEMM — forward V4R1 only; no GTC/XDLOPS on RDNA2 - # BWD (backward data-gradient) and WrW (weight-gradient) are training-only — disabled + # ASM Implicit GEMM - forward V4R1 only; no GTC/XDLOPS on RDNA2 + # BWD (backward data-gradient) and WrW (weight-gradient) are training-only - disabled "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_V4R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_V4R1": "0", - # HIP Implicit GEMM — non-XDLOPS V4R1/R4 forward only - # BWD (backward data-gradient) and WrW (weight-gradient) are training-only — disabled + # HIP Implicit GEMM - non-XDLOPS V4R1/R4 forward only + # BWD (backward data-gradient) and WrW (weight-gradient) are training-only - disabled "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4": "1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1": "0", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4": "0", - # Group Conv XDLOPS / CK default kernels — RDNA3/4 only, not available on RDNA2 + # Group Conv XDLOPS / CK default kernels - RDNA3/4 only, not available on RDNA2 "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "0", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "0", "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "0", } # --------------------------------------------------------------------------- -# RDNA3 — gfx1100 (RX 7000 series) +# RDNA3 - gfx1100 (RX 7000 series) # Fury Winograd added; MPASS F3x4 enabled; Group Conv XDLOPS + CK default kernels enabled # --------------------------------------------------------------------------- RDNA3: Dict[str, str] = { **RDNA2, - # Fury Winograd — introduced for gfx1100 (RDNA3) + # Fury Winograd - introduced for gfx1100 (RDNA3) "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "1", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "1", # Wider MPASS on RDNA3 "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "1", - # Group Conv XDLOPS / CK — available from gfx1100 (RDNA3) onwards + # Group Conv XDLOPS / CK - available from gfx1100 (RDNA3) onwards "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "1", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "1", "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "1", } # --------------------------------------------------------------------------- -# RDNA4 — gfx1200 (RX 9000 series) +# RDNA4 - gfx1200 (RX 9000 series) # Rage Winograd added; MPASS F3x5 enabled # --------------------------------------------------------------------------- RDNA4: Dict[str, str] = { **RDNA3, - # Rage Winograd — introduced for gfx1200 (RDNA4) + # Rage Winograd - introduced for gfx1200 (RDNA4) "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "1", # Wider MPASS on RDNA4 "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5": "1", diff --git a/scripts/rocm/rocm_vars.py b/scripts/rocm/rocm_vars.py index 0e2235015..1cf30231f 100644 --- a/scripts/rocm/rocm_vars.py +++ b/scripts/rocm/rocm_vars.py @@ -1,39 +1,51 @@ -from typing import Dict, Any, List, Tuple +from typing import Dict, Any, List, Tuple # --- General MIOpen/rocBLAS variables (dropdown/textbox/checkbox) --- GENERAL_VARS: Dict[str, Dict[str, Any]] = { - - # ── GEMM backend selector + companion toggles ────────────────────────── + "MIOPEN_SYSTEM_DB_PATH": { + "default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\", + "desc": "MIOpen system DB path", + "widget": "textbox", + "options": None, + "restart_required": True, + }, + "ROCBLAS_TENSILE_LIBPATH": { + "default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\rocblas\\library", + "desc": "rocBLAS Tensile library path", + "widget": "textbox", + "options": None, + "restart_required": True, + }, + # -- GEMM backend selector + companion toggles -------------------------- "MIOPEN_GEMM_ENFORCE_BACKEND": { "default": "1", - "desc": "Enforce GEMM backend", + "desc": "GEMM backend", "widget": "dropdown", "options": [("1 - rocBLAS", "1"), ("5 - hipBLASLt", "5")], "restart_required": False, }, "PYTORCH_ROCM_USE_ROCBLAS": { "default": "0", - "desc": "PyTorch ROCm: prioritise rocBLAS for linear algebra", + "desc": "PyTorch: Use rocBLAS.", "widget": "dropdown", "options": [("0 - Off", "0"), ("1 - On", "1")], "restart_required": True, }, "PYTORCH_HIPBLASLT_DISABLE": { "default": "1", - "desc": "Disable PyTorch hipBLASLt dispatcher", + "desc": "PyTorch: Use hipBLASLt.", "widget": "dropdown", "options": [("0 - Allow hipBLASLt", "0"), ("1 - Disable hipBLASLt", "1")], "restart_required": True, }, "ROCBLAS_USE_HIPBLASLT": { "default": "0", - "desc": "rocBLAS: use hipBLASLt backend (0 = Tensile)", + "desc": "rocBLAS: use hipBLASLt backend.", "widget": "dropdown", "options": [("0 - Tensile (rocBLAS)", "0"), ("1 - hipBLASLt", "1")], "restart_required": True, }, - - # ── MIOpen behavioural settings ──────────────────────────────────────── + # -- MIOpen behavioural settings ---------------------------------------- "MIOPEN_FIND_MODE": { "default": "2", "desc": "MIOpen Find Mode", @@ -57,34 +69,21 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = { }, "MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC": { "default": "0", - "desc": "Deterministic convolution (reproducible results, may be slower)", + "desc": "Deterministic convolutions", "widget": "dropdown", "options": [("0 - Off", "0"), ("1 - On", "1")], "restart_required": False, }, - # ── Paths / sizes ────────────────────────────────────────────────────── - "MIOPEN_SYSTEM_DB_PATH": { - "default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\", - "desc": "MIOpen system DB path", - "widget": "textbox", - "options": None, - "restart_required": True, - }, + # -- Paths / sizes ------------------------------------------------------ + "MIOPEN_CONVOLUTION_MAX_WORKSPACE": { "default": "1073741824", - "desc": "MIOpen convolution max workspace (bytes; 1 GB default)", + "desc": "MIOpen convolutions: max workspace (bytes; 1 GB)", "widget": "textbox", "options": None, "restart_required": False, }, - "ROCBLAS_TENSILE_LIBPATH": { - "default": "{VIRTUAL_ENV}\\Lib\\site-packages\\_rocm_sdk_devel\\bin\\rocblas\\library", - "desc": "rocBLAS Tensile library path", - "widget": "textbox", - "options": None, - "restart_required": True, - }, "ROCBLAS_DEVICE_MEMORY_SIZE": { "default": "", "desc": "rocBLAS workspace size in bytes (empty = dynamic)", @@ -94,13 +93,13 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = { }, "PYTORCH_TUNABLEOP_CACHE_DIR": { "default": "{ROOT}\\models\\tunable", - "desc": "TunableOp: kernel profile cache directory", + "desc": "TunableOp cache directory", "widget": "textbox", "options": None, "restart_required": False, }, - # ── rocBLAS settings ─────────────────────────────────────────────────── + # -- rocBLAS settings --------------------------------------------------- "ROCBLAS_STREAM_ORDER_ALLOC": { "default": "1", "desc": "rocBLAS stream-ordered memory allocation", @@ -110,27 +109,27 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = { }, "ROCBLAS_DEFAULT_ATOMICS_MODE": { "default": "1", - "desc": "rocBLAS default atomics mode (1 = allow non-deterministic for performance)", + "desc": "rocBLAS allow atomics", "widget": "dropdown", "options": [("0 - Off (deterministic)", "0"), ("1 - On (performance)", "1")], "restart_required": False, }, "PYTORCH_TUNABLEOP_ROCBLAS_ENABLED": { "default": "0", - "desc": "TunableOp: wrap and optimise rocBLAS GEMM calls", + "desc": "TunableOp: Enable tuning", "widget": "dropdown", "options": [("0 - Off", "0"), ("1 - On", "1")], "restart_required": False, }, "PYTORCH_TUNABLEOP_TUNING": { "default": "0", - "desc": "TunableOp: tuning mode (1 = benchmark; 0 = use saved CSV)", + "desc": "TunableOp: Tuning mode", "widget": "dropdown", - "options": [("0 - Use saved CSV", "0"), ("1 - Benchmark new shapes", "1")], + "options": [("0 - Use Cache", "0"), ("1 - Benchmark new shapes", "1")], "restart_required": False, }, - # ── hipBLASLt settings ───────────────────────────────────────────────── + # -- hipBLASLt settings ------------------------------------------------- "PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED": { "default": "0", "desc": "TunableOp: benchmark hipBLASLt kernels", @@ -139,7 +138,7 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = { "restart_required": False, }, - # ── Logging: MIOpen → rocBLAS → hipBLASLt ───────────────────────────── + # -- Logging: MIOpen -> rocBLAS -> hipBLASLt ----------------------------- "MIOPEN_LOG_LEVEL": { "default": "0", "desc": "MIOpen log verbosity level", @@ -171,13 +170,13 @@ GENERAL_VARS: Dict[str, Dict[str, Any]] = { } # --- Solver toggles (inference/FWD only, RDNA2/3/4 compatible) --- -# Removed entirely — not representable in the UI, cannot be set by users: -# WRW (weight-gradient) and BWD (data-gradient) — training passes only, never run during inference -# XDLOPS/CK CDNA-exclusive (MI100/MI200/MI300 matrix engine variants) — not on any RDNA -# Fixed-geometry (5x10, 7x7-ImageNet, 11x11) — shapes never appear in SD/video inference -# FP32-reference (NAIVE_CONV_FWD, FWDGEN) — IsApplicable() unreliable for FP16/BF16 -# Wide MPASS (F3x4..F7x3) — kernel sizes that cannot match any SD convolution shape -# Disabled by default (added but off): RDNA3/4-only — Group Conv XDLOPS, CK default kernels +# Removed entirely - not representable in the UI, cannot be set by users: +# WRW (weight-gradient) and BWD (data-gradient) - training passes only, never run during inference +# XDLOPS/CK CDNA-exclusive (MI100/MI200/MI300 matrix engine variants) - not on any RDNA +# Fixed-geometry (5x10, 7x7-ImageNet, 11x11) - shapes never appear in SD/video inference +# FP32-reference (NAIVE_CONV_FWD, FWDGEN) - IsApplicable() unreliable for FP16/BF16 +# Wide MPASS (F3x4..F7x3) - kernel sizes that cannot match any SD convolution shape +# Disabled by default (added but off): RDNA3/4-only - Group Conv XDLOPS, CK default kernels _SOLVER_DESCS: Dict[str, str] = {} _SOLVER_DESCS.update({ @@ -200,7 +199,7 @@ _SOLVER_DESCS.update({ "MIOPEN_DEBUG_ATTN_SOFTMAX": "Enable Attention Softmax", }) _SOLVER_DESCS.update({ - # Direct ASM — FWD inference only (WRW, fixed-geometry, FP32-reference removed) + # Direct ASM - FWD inference only (WRW, fixed-geometry, FP32-reference removed) "MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U": "Enable Direct ASM 3x3U", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U": "Enable Direct ASM 1x1U", "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2": "Enable Direct ASM 1x1UV2", @@ -208,12 +207,12 @@ _SOLVER_DESCS.update({ "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR": "Enable Direct ASM 1x1U AI Heuristic", }) _SOLVER_DESCS.update({ - # Direct OCL — FWD inference only (WRW, FWD11X11 fixed-geom, FWDGEN FP32-ref removed) + # Direct OCL - FWD inference only (WRW, FWD11X11 fixed-geom, FWDGEN FP32-ref removed) "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD": "Enable Direct OCL FWD", "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1": "Enable Direct OCL FWD1X1", }) _SOLVER_DESCS.update({ - # Winograd FWD — WRW removed; Fury/Rage kept as RDNA3/4 inference (off by default) + # Winograd FWD - WRW removed; Fury/Rage kept as RDNA3/4 inference (off by default) "MIOPEN_DEBUG_AMD_WINOGRAD_3X3": "Enable AMD Winograd 3x3", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS": "Enable AMD Winograd RxS", "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD": "Enable AMD Winograd RxS FWD", @@ -226,32 +225,32 @@ _SOLVER_DESCS.update({ "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "Enable AMD Winograd Rage RxS F2x3", }) _SOLVER_DESCS.update({ - # Multi-pass Winograd — only F3x2/F3x3 match typical 3x3 SD shapes; wider kernels removed + # Multi-pass Winograd - only F3x2/F3x3 match typical 3x3 SD shapes; wider kernels removed "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2": "Enable AMD Winograd MPASS F3x2", "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3": "Enable AMD Winograd MPASS F3x3", }) _SOLVER_DESCS.update({ - # Implicit GEMM FWD — BWD/WRW (training), CDNA-exclusive XDLOPS variants removed + # Implicit GEMM FWD - BWD/WRW (training), CDNA-exclusive XDLOPS variants removed "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1": "Enable ASM Implicit GEMM FWD V4R1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1": "Enable ASM Implicit GEMM FWD V4R1 1x1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1": "Enable HIP Implicit GEMM FWD V4R1", "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4": "Enable HIP Implicit GEMM FWD V4R4", }) _SOLVER_DESCS.update({ - # Group Conv XDLOPS FWD — RDNA3/4 (gfx1100+) only; disabled by default + # Group Conv XDLOPS FWD - RDNA3/4 (gfx1100+) only; disabled by default "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "Enable Group Conv Implicit GEMM XDLOPS FWD", "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "Enable Group Conv Implicit GEMM XDLOPS FWD AI Heuristic", - # CK (Composable Kernel) default kernels — RDNA3/4 (gfx1100+); disabled by default + # CK (Composable Kernel) default kernels - RDNA3/4 (gfx1100+); disabled by default "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "Enable CK (Composable Kernel) default kernels", }) # Solvers still in the registry but disabled by default. -# FORCE_IMMED_MODE_FALLBACK — overrides FIND_MODE entirely, defeats tuning DB -# Fury RxS F2x3/F3x2 — RDNA3/4-only; harmless on RDNA2 but won't select -# Rage RxS F2x3 — RDNA4-only -# Group Conv XDLOPS — RDNA3/4-only (gfx1100+) -# CK_DEFAULT_KERNELS — RDNA3/4-only (gfx1100+) +# FORCE_IMMED_MODE_FALLBACK - overrides FIND_MODE entirely, defeats tuning DB +# Fury RxS F2x3/F3x2 - RDNA3/4-only; harmless on RDNA2 but won't select +# Rage RxS F2x3 - RDNA4-only +# Group Conv XDLOPS - RDNA3/4-only (gfx1100+) +# CK_DEFAULT_KERNELS - RDNA3/4-only (gfx1100+) SOLVER_DISABLED_BY_DEFAULT = { "MIOPEN_DEBUG_FORCE_IMMED_MODE_FALLBACK", "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3",