diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8151bbe2f..41823eca8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,17 +15,19 @@ Plus some significant under-the-hood changes to improve code coverage and qualit
 - **Models**
   - [Flux.2 Klein](https://bfl.ai/blog/flux2-klein-towards-interactive-visual-intelligence)
     Flux.2-Klein is a new family of compact models from BFL in *4B and 9B sizes*, available as *distilled and base* variants
-    also included are *SDNQ prequantized variants*
+    also included are *sdnq prequantized variants*
   - [Qwen-Image-2512](https://qwen.ai/blog?id=qwen-image-2512)
     Qwen-Image successor, significantly reduces the AI-generated look and adds finer natural details and improved text rendering
     available in *original*, *sdnq-svd prequantized* and *sdnq-dynamic prequantized* variants
+    thanks @CalamitousFelicitousness
   - [LTX-2 19B Dev](https://ltx.io/model/ltx-2)
     LTX-2 is a new very large 19B-parameter video generation model from Lightricks using the Gemma-3 text encoder
-    available for T2I/I2I workflows in original and SDNQ prequantized variants
+    available for T2I/I2I workflows in original and sdnq prequantized variants
     *note*: audio generation and upsampling are not yet supported (soon)
   - [GLM-Image](https://z.ai/blog/glm-image)
     GLM-Image is a new image generation model with a hybrid autoregressive plus diffusion-decoder architecture
-    available in both *original* and *sdnq-dynamic prequantized* variants, thanks @CalamitousFelicitousness
+    available in both *original* and *sdnq-dynamic prequantized* variants
+    thanks @CalamitousFelicitousness
     *note*: model requires pre-release versions of the `transformers` package:
     > pip install --upgrade git+https://github.com/huggingface/transformers.git
     > ./webui.sh --experimental
diff --git a/installer.py b/installer.py
index 08a4bc3e0..c3bfe2c71 100644
--- a/installer.py
+++ b/installer.py
@@ -1400,6 +1400,7 @@ def set_environment():
     log.debug('Setting environment tuning')
     os.environ.setdefault('ACCELERATE', 'True')
     os.environ.setdefault('ATTN_PRECISION', 'fp16')
+    os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100')
     os.environ.setdefault('CUDA_AUTO_BOOST', '1')
     os.environ.setdefault('CUDA_CACHE_DISABLE', '0')
     os.environ.setdefault('CUDA_DEVICE_DEFAULT_PERSISTING_L2_CACHE_PERCENTAGE_LIMIT', '0')
@@ -1410,20 +1411,27 @@ def set_environment():
     os.environ.setdefault('GRADIO_ANALYTICS_ENABLED', 'False')
     os.environ.setdefault('K_DIFFUSION_USE_COMPILE', '0')
     os.environ.setdefault('KINETO_LOG_LEVEL', '3')
+    os.environ.setdefault('NEOReadDebugKeys', '1')
     os.environ.setdefault('NUMEXPR_MAX_THREADS', '16')
     os.environ.setdefault('PYTHONHTTPSVERIFY', '0')
+    os.environ.setdefault('PYTORCH_ENABLE_MPS_FALLBACK', '1')
+    os.environ.setdefault('PYTORCH_ENABLE_XPU_FALLBACK', '1')
+    os.environ.setdefault('RUNAI_STREAMER_CHUNK_BYTESIZE', '2097152')
+    os.environ.setdefault('RUNAI_STREAMER_LOG_LEVEL', 'DEBUG' if os.environ.get('SD_LOAD_DEBUG') else 'WARNING')
+    os.environ.setdefault('RUNAI_STREAMER_MEMORY_LIMIT', '-1')
     os.environ.setdefault('SAFETENSORS_FAST_GPU', '1')
+    os.environ.setdefault('SYCL_CACHE_PERSISTENT', '1')
     os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '2')
     os.environ.setdefault('TF_ENABLE_ONEDNN_OPTS', '0')
+    os.environ.setdefault('TOKENIZERS_PARALLELISM', '0')
     os.environ.setdefault('TORCH_CUDNN_V8_API_ENABLED', '1')
     os.environ.setdefault('TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD', '1')
+    os.environ.setdefault('TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL', '1')
+    os.environ.setdefault('UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS', '1')
     os.environ.setdefault('USE_TORCH', '1')
     os.environ.setdefault('UV_INDEX_STRATEGY', 'unsafe-any-match')
     os.environ.setdefault('UV_NO_BUILD_ISOLATION', '1')
     os.environ.setdefault('UVICORN_TIMEOUT_KEEP_ALIVE', '60')
-    os.environ.setdefault('RUNAI_STREAMER_CHUNK_BYTESIZE', '2097152')
-    os.environ.setdefault('RUNAI_STREAMER_MEMORY_LIMIT', '-1')
-    os.environ.setdefault('RUNAI_STREAMER_LOG_LEVEL', 'DEBUG' if os.environ.get('SD_LOAD_DEBUG') else 'WARNING')
     allocator = f'garbage_collection_threshold:{opts.get("torch_gc_threshold", 80)/100:0.2f},max_split_size_mb:512'
     if opts.get("torch_malloc", "native") == 'cudaMallocAsync':
         allocator += ',backend:cudaMallocAsync'
@@ -1433,14 +1441,6 @@ def set_environment():
     os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', allocator)
     os.environ.setdefault('PYTORCH_HIP_ALLOC_CONF', allocator)
     log.debug(f'Torch allocator: "{allocator}"')
-    os.environ.setdefault('TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL', '1')
-    os.environ.setdefault('NEOReadDebugKeys', '1')
-    os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100')
-    os.environ.setdefault('SYCL_CACHE_PERSISTENT', '1')
-    os.environ.setdefault('UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS', '1')
-    os.environ.setdefault('PYTORCH_ENABLE_XPU_FALLBACK', '1')
-    os.environ.setdefault('PYTORCH_ENABLE_MPS_FALLBACK', '1')
-    os.environ.setdefault('TOKENIZERS_PARALLELISM', '0')


 def check_extensions():
diff --git a/launch.py b/launch.py
index f74e96584..e0fbc61ed 100755
--- a/launch.py
+++ b/launch.py
@@ -68,8 +68,7 @@ def get_custom_args():
         installer.log.trace(f'Environment: {installer.print_dict(env)}')
     env = [f'{k}={v}' for k, v in os.environ.items() if k.startswith('SD_')]
     ld = [f'{k}={v}' for k, v in os.environ.items() if k.startswith('LD_')]
-    compute = [f'{k}={v}' for k, v in os.environ.items() if 'TORCH' in k or 'CUDA' in k or 'ROCM' in k or 'MIOPEN' in k]
-    installer.log.debug(f'Flags: sd={env} ld={ld} compute={compute}')
+    installer.log.debug(f'Flags: sd={env} ld={ld}')
     rec('args')
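Note on the `installer.py` change: the environment-variable block is now kept in a single alphabetized list rather than split across the function. This reshuffle is behavior-preserving because `os.environ.setdefault` only applies a default when the key is unset, so the calls are order-independent for distinct keys and never override a value the user exported before launch. A minimal standalone sketch of that behavior, using variable names taken from the diff (the `pop` is only there to make the example deterministic):

```python
import os

os.environ['TOKENIZERS_PARALLELISM'] = '1'            # simulate a value the user exported before launch
os.environ.pop('SYCL_CACHE_PERSISTENT', None)         # ensure this key starts unset for the demo

os.environ.setdefault('TOKENIZERS_PARALLELISM', '0')  # ignored: key already present
os.environ.setdefault('SYCL_CACHE_PERSISTENT', '1')   # applied: key was missing

print(os.environ['TOKENIZERS_PARALLELISM'])  # -> 1 (user value wins over the tuning default)
print(os.environ['SYCL_CACHE_PERSISTENT'])   # -> 1 (tuning default applied)
```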
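For context on the unchanged allocator lines that now follow the reordered block: the same string ends up in both `PYTORCH_CUDA_ALLOC_CONF` and `PYTORCH_HIP_ALLOC_CONF`. A sketch of the value it produces, assuming the default `torch_gc_threshold` of 80 and the `native` malloc setting shown in the diff; the `opts` dict here is a hypothetical stand-in for the real options object:

```python
# Hypothetical stand-in for the real `opts` object, using the defaults visible in the diff.
opts = {'torch_gc_threshold': 80, 'torch_malloc': 'native'}

allocator = f'garbage_collection_threshold:{opts.get("torch_gc_threshold", 80)/100:0.2f},max_split_size_mb:512'
if opts.get('torch_malloc', 'native') == 'cudaMallocAsync':
    allocator += ',backend:cudaMallocAsync'  # only appended when the async backend is selected

print(allocator)
# -> garbage_collection_threshold:0.80,max_split_size_mb:512
```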