diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c8b34f18..3dc12c82b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,8 @@ - Update stable-fast with support for torch 2.2.2 and 2.3.0, thanks @Aptronymist - Add torch *cudaMallocAsync* in compute options Can improve memory utilization on compatible GPUs (RTX and newer) + - Torch dynamic profiling + You can enable/disable full torch profiling in settings top menu on-the-fly - Support controlnet manually downloads models in both standalone and diffusers format For standalone, simply copy safetensors file to `models/control/controlnet` folder For diffusers format, create folder with model name in `models/control/controlnet/` diff --git a/modules/api/control.py b/modules/api/control.py index d7db1fe6f..7c0607b34 100644 --- a/modules/api/control.py +++ b/modules/api/control.py @@ -153,7 +153,7 @@ class APIControl(): # run with self.queue_lock: - shared.state.begin('api-control', api=True) + shared.state.begin('API-CTL', api=True) output_images = [] output_processed = [] output_info = '' diff --git a/modules/api/generate.py b/modules/api/generate.py index 6bfc33ab8..371f3c0bf 100644 --- a/modules/api/generate.py +++ b/modules/api/generate.py @@ -104,7 +104,7 @@ class APIGenerate(): p.scripts = script_runner p.outpath_grids = shared.opts.outdir_grids or shared.opts.outdir_txt2img_grids p.outpath_samples = shared.opts.outdir_samples or shared.opts.outdir_txt2img_samples - shared.state.begin('api-txt2img', api=True) + shared.state.begin('API TXT', api=True) script_args = script.init_script_args(p, txt2imgreq, self.default_script_arg_txt2img, selectable_scripts, selectable_script_idx, script_runner) if selectable_scripts is not None: processed = scripts.scripts_txt2img.run(p, *script_args) # Need to pass args as list here @@ -148,7 +148,7 @@ class APIGenerate(): p.scripts = script_runner p.outpath_grids = shared.opts.outdir_img2img_grids p.outpath_samples = shared.opts.outdir_img2img_samples - shared.state.begin('api-img2img', api=True) + shared.state.begin('API-IMG', api=True) script_args = script.init_script_args(p, img2imgreq, self.default_script_arg_img2img, selectable_scripts, selectable_script_idx, script_runner) if selectable_scripts is not None: processed = scripts.scripts_img2img.run(p, *script_args) # Need to pass args as list here diff --git a/modules/api/process.py b/modules/api/process.py index 343b6efb4..830aa14d9 100644 --- a/modules/api/process.py +++ b/modules/api/process.py @@ -64,7 +64,7 @@ class APIProcess(): for k, v in req.params.items(): if k not in processors.config[processor.processor_id]['params']: return JSONResponse(status_code=400, content={"error": f"Processor invalid parameter: id={req.model} {k}={v}"}) - shared.state.begin('api-preprocess', api=True) + shared.state.begin('API-PRE', api=True) processed = processor(image, local_config=req.params) image = encode_pil_to_base64(processed) shared.state.end(api=False) @@ -90,7 +90,7 @@ class APIProcess(): return JSONResponse(status_code=400, content={"error": f"Mask invalid parameter: {k}={v}"}) else: setattr(masking.opts, k, v) - shared.state.begin('api-mask', api=True) + shared.state.begin('API-MASK', api=True) with self.queue_lock: processed = masking.run_mask(input_image=image, input_mask=mask, return_type=req.type) shared.state.end(api=False) diff --git a/modules/errors.py b/modules/errors.py index ef650f52c..f3bf4319e 100644 --- a/modules/errors.py +++ b/modules/errors.py @@ -89,8 +89,18 @@ def profile(profiler, msg: str): def profile_torch(profiler, msg: str): profiler.stop() + lines = profiler.key_averages().table(sort_by="cpu_time_total", row_limit=12) + lines = lines.split('\n') + lines = [x for x in lines if '/profiler' not in x and '---' not in x] + txt = '\n'.join(lines) + log.debug(f'Torch profile CPU-total {msg}: \n{txt}') lines = profiler.key_averages().table(sort_by="self_cpu_time_total", row_limit=12) lines = lines.split('\n') lines = [x for x in lines if '/profiler' not in x and '---' not in x] txt = '\n'.join(lines) - log.debug(f'Torch profile {msg}: \n{txt}') + log.debug(f'Torch profile CPU-self {msg}: \n{txt}') + lines = profiler.key_averages().table(sort_by="cuda_time_total", row_limit=12) + lines = lines.split('\n') + lines = [x for x in lines if '/profiler' not in x and '---' not in x] + txt = '\n'.join(lines) + log.debug(f'Torch profile CUDA {msg}: \n{txt}') diff --git a/modules/extras.py b/modules/extras.py index 150bcd8d0..cc646990d 100644 --- a/modules/extras.py +++ b/modules/extras.py @@ -54,7 +54,7 @@ def to_half(tensor, enable): def run_modelmerger(id_task, **kwargs): # pylint: disable=unused-argument - shared.state.begin('merge') + shared.state.begin('Merge') t0 = time.time() def fail(message): @@ -284,7 +284,7 @@ def run_modelconvert(model, checkpoint_formats, precision, conv_type, custom_nam "vae": vae_conv, "other": others_conv } - shared.state.begin('convert') + shared.state.begin('Convert') model_info = sd_models.checkpoints_list[model] shared.state.textinfo = f"Loading {model_info.filename}..." shared.log.info(f"Model convert loading: {model_info.filename}") diff --git a/modules/hashes.py b/modules/hashes.py index 607140fe6..cf83794b0 100644 --- a/modules/hashes.py +++ b/modules/hashes.py @@ -69,7 +69,7 @@ def sha256(filename, title, use_addnet_hash=False): if not os.path.isfile(filename): return None orig_state = copy.deepcopy(shared.state) - shared.state.begin("hash") + shared.state.begin("Hash") if use_addnet_hash: if progress_ok: try: diff --git a/modules/interrogate.py b/modules/interrogate.py index fdd23fd36..ae4cd2926 100644 --- a/modules/interrogate.py +++ b/modules/interrogate.py @@ -163,7 +163,7 @@ class InterrogateModels: def interrogate(self, pil_image): res = "" - shared.state.begin('interrogate') + shared.state.begin('Interrogate') try: if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram): lowvram.send_everything_to_cpu() @@ -267,8 +267,7 @@ def interrogate(image, mode, caption=None): def interrogate_image(image, model, mode): - shared.state.begin() - shared.state.job = 'interrogate' + shared.state.begin('Interrogate') try: if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram): lowvram.send_everything_to_cpu() @@ -295,8 +294,7 @@ def interrogate_batch(batch_files, batch_folder, batch_str, model, mode, write): if len(files) == 0: shared.log.error('Interrogate batch no images') return '' - shared.state.begin() - shared.state.job = 'batch interrogate' + shared.state.begin('Batch interrogate') prompts = [] try: if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram): diff --git a/modules/modelloader.py b/modules/modelloader.py index f541281a1..71114ed3c 100644 --- a/modules/modelloader.py +++ b/modules/modelloader.py @@ -45,7 +45,7 @@ def download_civit_preview(model_path: str, preview_url: str): block_size = 16384 # 16KB blocks written = 0 img = None - shared.state.begin('civitai') + shared.state.begin('CivitAI') try: with open(preview_file, 'wb') as f: with p.Progress(p.TextColumn('[cyan]{task.description}'), p.DownloadColumn(), p.BarColumn(), p.TaskProgressColumn(), p.TimeRemainingColumn(), p.TimeElapsedColumn(), p.TransferSpeedColumn(), console=shared.console) as progress: @@ -107,7 +107,7 @@ def download_civit_model_thread(model_name, model_url, model_path, model_type, t total_size = int(r.headers.get('content-length', 0)) res += f' size={round((starting_pos + total_size)/1024/1024, 2)}Mb' shared.log.info(res) - shared.state.begin('civitai') + shared.state.begin('CivitAI') block_size = 16384 # 16KB blocks written = starting_pos global download_pbar # pylint: disable=global-statement @@ -162,7 +162,7 @@ def download_diffusers_model(hub_id: str, cache_dir: str = None, download_config return None from diffusers import DiffusionPipeline import huggingface_hub as hf - shared.state.begin('huggingface') + shared.state.begin('HuggingFace') if download_config is None: download_config = { "force_download": False, diff --git a/modules/postprocessing.py b/modules/postprocessing.py index db3645d75..4e307bec3 100644 --- a/modules/postprocessing.py +++ b/modules/postprocessing.py @@ -10,7 +10,7 @@ from modules.shared import opts def run_postprocessing(extras_mode, image, image_folder: List[tempfile.NamedTemporaryFile], input_dir, output_dir, show_extras_results, *args, save_output: bool = True): devices.torch_gc() - shared.state.begin('extras') + shared.state.begin('Extras') image_data = [] image_names = [] image_fullnames = [] diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py index 779b5352a..2fe0a3b3b 100644 --- a/modules/processing_diffusers.py +++ b/modules/processing_diffusers.py @@ -169,7 +169,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing): p.ops.append('upscale') if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_highres_fix and hasattr(shared.sd_model, 'vae'): save_intermediate(p, latents=output.images, suffix="-before-hires") - shared.state.job = 'upscale' + shared.state.job = 'Upscale' output.images = resize_hires(p, latents=output.images) sd_hijack_hypertile.hypertile_set(p, hr=True) @@ -211,7 +211,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing): desc='Hires', ) update_sampler(p, shared.sd_model, second_pass=True) - shared.state.job = 'hires' + shared.state.job = 'HiRes' shared.state.sampling_steps = hires_args.get('num_inference_steps', None) or p.steps try: sd_models_compile.check_deepcache(enable=True) @@ -230,7 +230,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing): # optional refiner pass or decode if is_refiner_enabled(): prev_job = shared.state.job - shared.state.job = 'refine' + shared.state.job = 'Refine' shared.state.job_count +=1 if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_refiner and hasattr(shared.sd_model, 'vae'): save_intermediate(p, latents=output.images, suffix="-before-refiner") diff --git a/modules/processing_helpers.py b/modules/processing_helpers.py index 30d23d311..3d4ae2a43 100644 --- a/modules/processing_helpers.py +++ b/modules/processing_helpers.py @@ -191,7 +191,7 @@ def decode_first_stage(model, x, full_quality=True): x_sample = torch.zeros((len(x), 3, x.shape[2] * 8, x.shape[3] * 8), dtype=devices.dtype_vae, device=devices.device) return x_sample prev_job = shared.state.job - shared.state.job = 'vae' + shared.state.job = 'VAE' with devices.autocast(disable = x.dtype==devices.dtype_vae): try: if full_quality: diff --git a/modules/processing_original.py b/modules/processing_original.py index 0f61cf41b..822c2ed2d 100644 --- a/modules/processing_original.py +++ b/modules/processing_original.py @@ -97,7 +97,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning, p.extra_generation_params, p.restore_faces = bak_extra_generation_params, bak_restore_faces images.save_image(image, p.outpath_samples, "", seeds[i], prompts[i], shared.opts.samples_format, info=info, suffix="-before-hires") if latent_scale_mode is None or p.hr_force: # non-latent upscaling - shared.state.job = 'upscale' + shared.state.job = 'Upscale' if decoded_samples is None: decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality) decoded_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0) @@ -126,7 +126,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning, if p.hr_sampler_name == "PLMS": p.hr_sampler_name = 'UniPC' if p.hr_force or latent_scale_mode is not None: - shared.state.job = 'hires' + shared.state.job = 'HiRes' if p.denoising_strength > 0: p.ops.append('hires') devices.torch_gc() # GC now before running the next img2img to prevent running out of memory diff --git a/modules/processing_vae.py b/modules/processing_vae.py index 1120b4ba1..972b6b7b8 100644 --- a/modules/processing_vae.py +++ b/modules/processing_vae.py @@ -109,7 +109,7 @@ def taesd_vae_encode(image): def vae_decode(latents, model, output_type='np', full_quality=True): t0 = time.time() prev_job = shared.state.job - shared.state.job = 'vae' + shared.state.job = 'VAE' if not torch.is_tensor(latents): # already decoded return latents if latents.shape[0] == 0: diff --git a/modules/sd_models.py b/modules/sd_models.py index 3b476d4f6..a4279f6b6 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -1487,7 +1487,7 @@ def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model') return None orig_state = copy.deepcopy(shared.state) shared.state = shared_state.State() - shared.state.begin('load') + shared.state.begin('Load') if load_dict: shared.log.debug(f'Model dict: existing={sd_model is not None} target={checkpoint_info.filename} info={info}') else: diff --git a/modules/ui.py b/modules/ui.py index 45f7ed79f..71819947a 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -278,6 +278,7 @@ def create_ui(startup_timer = None): shutdown_submit = gr.Button(value="Shutdown server", variant='primary', elem_id="shutdown_submit") unload_sd_model = gr.Button(value='Unload checkpoint', variant='primary', elem_id="sett_unload_sd_model") reload_sd_model = gr.Button(value='Reload checkpoint', variant='primary', elem_id="sett_reload_sd_model") + enable_profiling = gr.Button(value='Start profiling', variant='primary', elem_id="start_profiling") with gr.Tabs(elem_id="system") as system_tabs: global ui_system_tabs # pylint: disable=global-statement @@ -363,8 +364,13 @@ def create_ui(startup_timer = None): def reload_sd_weights(): modules.sd_models.reload_model_weights() + def switch_profiling(): + shared.cmd_opts.profile = not shared.cmd_opts.profile + shared.log.warning(f'Profiling: {shared.cmd_opts.profile}') + unload_sd_model.click(fn=unload_sd_weights, inputs=[], outputs=[]) reload_sd_model.click(fn=reload_sd_weights, inputs=[], outputs=[]) + enable_profiling.click(fn=switch_profiling, inputs=[], outputs=[]) request_notifications.click(fn=lambda: None, inputs=[], outputs=[], _js='function(){}') preview_theme.click(fn=None, _js='previewTheme', inputs=[], outputs=[]) diff --git a/modules/ui_control.py b/modules/ui_control.py index f63e19baf..d373cb26c 100644 --- a/modules/ui_control.py +++ b/modules/ui_control.py @@ -46,7 +46,7 @@ def generate_click(job_id: str, active_tab: str, *args): time.sleep(0.01) from modules.control.run import control_run debug(f'Control: tab="{active_tab}" job={job_id} args={args}') - shared.state.begin('control') + shared.state.begin('Generate') progress.add_task_to_queue(job_id) with call_queue.queue_lock: yield [None, None, None, None, 'Control: starting'] diff --git a/modules/upscaler.py b/modules/upscaler.py index 2f66c5d34..744e60052 100644 --- a/modules/upscaler.py +++ b/modules/upscaler.py @@ -95,7 +95,7 @@ class Upscaler: def upscale(self, img: Image, scale, selected_model: str = None): orig_state = copy.deepcopy(shared.state) - shared.state.begin('upscale') + shared.state.begin('Upscale') self.scale = scale dest_w = int(img.width * scale) dest_h = int(img.height * scale) diff --git a/webui.py b/webui.py index 0f88cf5d1..718f8b0c8 100644 --- a/webui.py +++ b/webui.py @@ -159,7 +159,7 @@ def load_model(): if not opts.sd_checkpoint_autoload or (shared.cmd_opts.ckpt is not None and shared.cmd_opts.ckpt.lower() != 'none'): log.debug('Model auto load disabled') else: - shared.state.begin('load') + shared.state.begin('Load') thread_model = Thread(target=lambda: shared.sd_model) thread_model.start() thread_refiner = Thread(target=lambda: shared.sd_refiner)