# Mirror of https://github.com/vladmandic/automatic (254 lines, 14 KiB, Python)
"""
Architecture-specific MIOpen solver profiles for AMD GCN/RDNA GPUs.

Sources:
  https://rocm.docs.amd.com/projects/MIOpen/en/develop/reference/env_variables.html

Key axis: consumer RDNA GPUs have NO XDLOPS hardware (that's CDNA/Instinct only).
  RDNA2 (gfx1030): RX 6000 series
  RDNA3 (gfx1100): RX 7000 series — adds Fury Winograd, wider MPASS
  RDNA4 (gfx1200): RX 9000 series — adds Rage Winograd, wider MPASS

Each profile is a dict of {var: value} that will be MERGED on top of the
current config (general vars like DB path / log level are preserved).
"""

from typing import Dict, Set

# ---------------------------------------------------------------------------
# Shared: everything that must be OFF on ALL consumer RDNA (no XDLOPS hw)
# ---------------------------------------------------------------------------
# Every entry maps a MIOpen env var to "0". The dict is spread into the RDNA2
# profile (and therefore inherited by RDNA3 / RDNA4), and its key set is also
# the base of the per-arch UNAVAILABLE sets further down in this module.
_XDLOPS_OFF: Dict[str, str] = {
    # GTC XDLOPS (CDNA-only)
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC": "0",
    # NOTE(review): this key is named DLOPS, not XDLOPS — confirm it really
    # belongs in the "no XDLOPS hardware" group.
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_DLOPS_NCHWC": "0",
    # HIP XDLOPS variants (CDNA-only)
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R5_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_PADDED_GEMM_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_PADDED_GEMM_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_XDLOPS_EMULATE": "0",
    "MIOPEN_DEBUG_IMPLICIT_GEMM_XDLOPS_INLINE_ASM": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_GROUP_BWD_XDLOPS": "0",
    "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_BWD_XDLOPS_AI_HEUR": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_FWD_V4R4_XDLOPS_ADD_VECTOR_LOAD_GEMMN_TUNE_PARAM": "0",
    # 3D XDLOPS (CDNA-only; no 3D conv XDLOPS on consumer RDNA)
    "MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "0",
    "MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_BWD_XDLOPS": "0",
    "MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_WRW_XDLOPS": "0",
    # Composable Kernel (requires XDLOPS / CDNA)
    # NOTE(review): the first key below is named DLOPS, not XDLOPS — confirm.
    "MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_V6R1_DLOPS_NCHW": "0",
    "MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_BIAS_ACTIV": "0",
    "MIOPEN_DEBUG_CONV_CK_IGEMM_FWD_BIAS_RES_ADD_ACTIV": "0",
    # MLIR (CDNA-only in practice)
    "MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW_XDLOPS": "0",
    "MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS": "0",
    # MP BD Winograd (Multi-pass Block-Decomposed — CDNA / high-end only)
    "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F2X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F3X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F4X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F5X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F6X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F2X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F3X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F4X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F5X3": "0",
    "MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F6X3": "0",
}

# ---------------------------------------------------------------------------
# RDNA2 — gfx1030 (RX 6000 series)
# No XDLOPS, no Fury/Rage Winograd, MPASS limited to F3x2/F3x3
# ASM IGEMM: V4R1 variants only; HIP IGEMM: non-XDLOPS V4R1/R4 only
# ---------------------------------------------------------------------------
# Base profile: starts from _XDLOPS_OFF and then sets every remaining solver
# toggle explicitly. RDNA3 and RDNA4 are built by overriding entries of this
# dict, so a key added here is inherited by the newer profiles as well.
RDNA2: Dict[str, str] = {
    **_XDLOPS_OFF,
    # General settings (architecture-independent; set here so all profiles cover them)
    "MIOPEN_SEARCH_CUTOFF": "0",
    "MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC": "0",
    # Core algo enables — FFT is FP32-only but harmless (IsApplicable rejects it for fp16 tensors)
    "MIOPEN_DEBUG_CONV_FFT": "1",
    "MIOPEN_DEBUG_CONV_DIRECT": "1",
    "MIOPEN_DEBUG_CONV_GEMM": "1",
    "MIOPEN_DEBUG_CONV_WINOGRAD": "1",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM": "1",
    "MIOPEN_DEBUG_CONV_IMMED_FALLBACK": "1",
    "MIOPEN_DEBUG_ENABLE_AI_IMMED_MODE_FALLBACK": "1",
    "MIOPEN_DEBUG_FORCE_IMMED_MODE_FALLBACK": "0",
    # Kernel backends
    "MIOPEN_DEBUG_GCN_ASM_KERNELS": "1",
    "MIOPEN_DEBUG_HIP_KERNELS": "1",
    "MIOPEN_DEBUG_OPENCL_CONVOLUTIONS": "1",
    "MIOPEN_DEBUG_OPENCL_WAVE64_NOWGP": "1",
    "MIOPEN_DEBUG_ATTN_SOFTMAX": "1",
    # Direct ASM — dtype notes
    # 3X3U / 1X1U / 1X1UV2: FP32/FP16 forward — enabled
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U": "1",
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U": "1",
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2": "1",
    # 5X10U2V2: fixed geometry (5*10 stride-2), no SD conv matches — disabled
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_5X10U2V2": "0",
    # 7X7C3H224W224: hard-coded ImageNet stem (C=3, H=W=224, K=64) — never matches SD — disabled
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_7X7C3H224W224": "0",
    # WRW3X3 / WRW1X1: FP32-only weight-gradient (training only) — disabled for inference
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3": "0",
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1": "0",
    # PERF_VALS intentionally blank: MIOpen reads this as a config string not a boolean;
    # setting to "1" causes GetPerfConfFromEnv to use a degenerate config and return float32
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_PERF_VALS": "",
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_SEARCH_OPTIMIZED": "1",
    "MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR": "1",
    # NAIVE_CONV_FWD: scalar FP32 reference solver — IsApplicable does NOT reliably filter for FP16;
    # can be selected for unusual shapes (e.g. VAE decoder 3-ch output) and returns dtype=float32
    "MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD": "0",
    # Direct OCL — dtype notes
    # FWD / FWD1X1: FP32/FP16 forward — enabled
    "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD": "1",
    "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1": "1",
    # FWD11X11: requires 11*11 kernel — no SD match — disabled
    "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11": "0",
    # FWDGEN: FP32 generic OCL fallback — IsApplicable does NOT reliably reject for FP16;
    # can produce dtype=float32 output for FP16 inputs — disabled
    "MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN": "0",
    # WRW2 / WRW53 / WRW1X1: training-only weight-gradient — disabled
    "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2": "0",
    "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53": "0",
    "MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1": "0",
    # Winograd RxS — dtype per MIOpen docs
    # WINOGRAD_3X3: FP32-only — harmless (IsApplicable rejects for fp16); enabled
    "MIOPEN_DEBUG_AMD_WINOGRAD_3X3": "1",
    # RXS: covers FP32/FP16 F(3,3) Fwd/Bwd + FP32 F(3,2) WrW — keep enabled (fp16 fwd/bwd path exists)
    "MIOPEN_DEBUG_AMD_WINOGRAD_RXS": "1",
    # RXS_FWD_BWD: FP32/FP16 — explicitly the fp16-capable subset
    "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD": "1",
    # RXS_WRW: FP32 WrW only — training-only, disabled for inference fp16 profile
    "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_WRW": "0",
    # RXS_F3X2: FP32/FP16 Fwd/Bwd
    "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2": "1",
    # RXS_F2X3: FP32/FP16 Fwd/Bwd (group convolutions)
    "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3": "1",
    # RXS_F2X3_G1: FP32/FP16 Fwd/Bwd (non-group convolutions)
    "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1": "1",
    # FUSED_WINOGRAD: FP32-only — harmless (IsApplicable rejects for fp16); enabled
    "MIOPEN_DEBUG_AMD_FUSED_WINOGRAD": "1",
    # PERF_VALS intentionally blank: same reason as ASM_1X1U — not a boolean, config string
    "MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_PERF_VALS": "",
    # Fury/Rage Winograd — NOT available on RDNA2
    "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "0",
    # MPASS — only F3x2 and F3x3 are safe on RDNA2
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2": "1",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3": "1",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X6": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X3": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2": "0",
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3": "0",
    # ASM Implicit GEMM — forward V4R1 only; no GTC/XDLOPS on RDNA2
    # BWD (backward data-gradient) and WrW (weight-gradient) are training-only — disabled
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1": "1",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1": "1",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_V4R1": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_V4R1": "0",
    # HIP Implicit GEMM — non-XDLOPS V4R1/R4 forward only
    # BWD (backward data-gradient) and WrW (weight-gradient) are training-only — disabled
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1": "1",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4": "1",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1": "0",
    "MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4": "0",
    # Group Conv XDLOPS / CK default kernels — RDNA3/4 only, not available on RDNA2
    "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "0",
    "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "0",
    "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "0",
}

# ---------------------------------------------------------------------------
# RDNA3 — gfx1100 (RX 7000 series)
# Fury Winograd added; MPASS F3x4 enabled; Group Conv XDLOPS + CK default kernels enabled
# ---------------------------------------------------------------------------
# Built as RDNA2 plus overrides: only the keys listed after the spread differ
# from the RDNA2 profile, and each of them flips a "0" to "1".
RDNA3: Dict[str, str] = {
    **RDNA2,
    # Fury Winograd — introduced for gfx1100 (RDNA3)
    "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3": "1",
    "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2": "1",
    # Wider MPASS on RDNA3
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4": "1",
    # Group Conv XDLOPS / CK — available from gfx1100 (RDNA3) onwards
    # NOTE(review): the module docstring says consumer RDNA has no XDLOPS
    # hardware, yet these *_XDLOPS toggles are switched on — confirm intent.
    "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS": "1",
    "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR": "1",
    "MIOPEN_DEBUG_CK_DEFAULT_KERNELS": "1",
}

# ---------------------------------------------------------------------------
# RDNA4 — gfx1200 (RX 9000 series)
# Rage Winograd added; MPASS F3x5 enabled
# ---------------------------------------------------------------------------
# Built as RDNA3 plus overrides: the two keys after the spread are the only
# differences from the RDNA3 profile.
RDNA4: Dict[str, str] = {
    **RDNA3,
    # Rage Winograd — introduced for gfx1200 (RDNA4)
    "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3": "1",
    # Wider MPASS on RDNA4
    "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5": "1",
}

# Lookup table from architecture name to its {env var: value} profile dict.
# The values are the module-level dicts themselves, not copies.
PROFILES: Dict[str, Dict[str, str]] = {
    "RDNA2": RDNA2,
    "RDNA3": RDNA3,
    "RDNA4": RDNA4,
}

# Vars that are architecturally unavailable (no supporting hardware) per arch.
# These will be visually marked in the UI with strikethrough.
# Every key of _XDLOPS_OFF is treated as unavailable on all three profiles.
_UNAVAILABLE_ALL_RDNA: Set[str] = set(_XDLOPS_OFF)

# Per-architecture sets: the shared base unioned with the vars that only
# newer generations provide. Keyed by the same names as PROFILES.
UNAVAILABLE: Dict[str, Set[str]] = {
    "RDNA2": _UNAVAILABLE_ALL_RDNA | {
        "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F2X3",
        "MIOPEN_DEBUG_AMD_WINOGRAD_FURY_RXS_F3X2",
        "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X6",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X3",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3",
        "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS",
        "MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS_AI_HEUR",
        "MIOPEN_DEBUG_CK_DEFAULT_KERNELS",
    },
    "RDNA3": _UNAVAILABLE_ALL_RDNA | {
        "MIOPEN_DEBUG_AMD_WINOGRAD_RAGE_RXS_F2X3",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X6",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X3",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3",
    },
    "RDNA4": _UNAVAILABLE_ALL_RDNA | {
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X6",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X3",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2",
        "MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3",
    },
}