torch dynamic profiling

pull/3153/head
Vladimir Mandic 2024-05-16 18:40:18 -04:00
parent d63f35e298
commit 554e5c8224
19 changed files with 45 additions and 29 deletions

View File

@ -122,6 +122,8 @@
- Update stable-fast with support for torch 2.2.2 and 2.3.0, thanks @Aptronymist
- Add torch *cudaMallocAsync* in compute options
Can improve memory utilization on compatible GPUs (RTX and newer)
- Torch dynamic profiling
You can enable/disable full torch profiling on-the-fly from the settings top menu
- Support manually downloaded controlnet models in both standalone and diffusers formats
For standalone, simply copy safetensors file to `models/control/controlnet` folder
For diffusers format, create folder with model name in `models/control/controlnet/`

View File

@ -153,7 +153,7 @@ class APIControl():
# run
with self.queue_lock:
shared.state.begin('api-control', api=True)
shared.state.begin('API-CTL', api=True)
output_images = []
output_processed = []
output_info = ''

View File

@ -104,7 +104,7 @@ class APIGenerate():
p.scripts = script_runner
p.outpath_grids = shared.opts.outdir_grids or shared.opts.outdir_txt2img_grids
p.outpath_samples = shared.opts.outdir_samples or shared.opts.outdir_txt2img_samples
shared.state.begin('api-txt2img', api=True)
shared.state.begin('API TXT', api=True)
script_args = script.init_script_args(p, txt2imgreq, self.default_script_arg_txt2img, selectable_scripts, selectable_script_idx, script_runner)
if selectable_scripts is not None:
processed = scripts.scripts_txt2img.run(p, *script_args) # Need to pass args as list here
@ -148,7 +148,7 @@ class APIGenerate():
p.scripts = script_runner
p.outpath_grids = shared.opts.outdir_img2img_grids
p.outpath_samples = shared.opts.outdir_img2img_samples
shared.state.begin('api-img2img', api=True)
shared.state.begin('API-IMG', api=True)
script_args = script.init_script_args(p, img2imgreq, self.default_script_arg_img2img, selectable_scripts, selectable_script_idx, script_runner)
if selectable_scripts is not None:
processed = scripts.scripts_img2img.run(p, *script_args) # Need to pass args as list here

View File

@ -64,7 +64,7 @@ class APIProcess():
for k, v in req.params.items():
if k not in processors.config[processor.processor_id]['params']:
return JSONResponse(status_code=400, content={"error": f"Processor invalid parameter: id={req.model} {k}={v}"})
shared.state.begin('api-preprocess', api=True)
shared.state.begin('API-PRE', api=True)
processed = processor(image, local_config=req.params)
image = encode_pil_to_base64(processed)
shared.state.end(api=False)
@ -90,7 +90,7 @@ class APIProcess():
return JSONResponse(status_code=400, content={"error": f"Mask invalid parameter: {k}={v}"})
else:
setattr(masking.opts, k, v)
shared.state.begin('api-mask', api=True)
shared.state.begin('API-MASK', api=True)
with self.queue_lock:
processed = masking.run_mask(input_image=image, input_mask=mask, return_type=req.type)
shared.state.end(api=False)

View File

@ -89,8 +89,18 @@ def profile(profiler, msg: str):
def profile_torch(profiler, msg: str):
profiler.stop()
lines = profiler.key_averages().table(sort_by="cpu_time_total", row_limit=12)
lines = lines.split('\n')
lines = [x for x in lines if '/profiler' not in x and '---' not in x]
txt = '\n'.join(lines)
log.debug(f'Torch profile CPU-total {msg}: \n{txt}')
lines = profiler.key_averages().table(sort_by="self_cpu_time_total", row_limit=12)
lines = lines.split('\n')
lines = [x for x in lines if '/profiler' not in x and '---' not in x]
txt = '\n'.join(lines)
log.debug(f'Torch profile {msg}: \n{txt}')
log.debug(f'Torch profile CPU-self {msg}: \n{txt}')
lines = profiler.key_averages().table(sort_by="cuda_time_total", row_limit=12)
lines = lines.split('\n')
lines = [x for x in lines if '/profiler' not in x and '---' not in x]
txt = '\n'.join(lines)
log.debug(f'Torch profile CUDA {msg}: \n{txt}')

View File

@ -54,7 +54,7 @@ def to_half(tensor, enable):
def run_modelmerger(id_task, **kwargs): # pylint: disable=unused-argument
shared.state.begin('merge')
shared.state.begin('Merge')
t0 = time.time()
def fail(message):
@ -284,7 +284,7 @@ def run_modelconvert(model, checkpoint_formats, precision, conv_type, custom_nam
"vae": vae_conv,
"other": others_conv
}
shared.state.begin('convert')
shared.state.begin('Convert')
model_info = sd_models.checkpoints_list[model]
shared.state.textinfo = f"Loading {model_info.filename}..."
shared.log.info(f"Model convert loading: {model_info.filename}")

View File

@ -69,7 +69,7 @@ def sha256(filename, title, use_addnet_hash=False):
if not os.path.isfile(filename):
return None
orig_state = copy.deepcopy(shared.state)
shared.state.begin("hash")
shared.state.begin("Hash")
if use_addnet_hash:
if progress_ok:
try:

View File

@ -163,7 +163,7 @@ class InterrogateModels:
def interrogate(self, pil_image):
res = ""
shared.state.begin('interrogate')
shared.state.begin('Interrogate')
try:
if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
lowvram.send_everything_to_cpu()
@ -267,8 +267,7 @@ def interrogate(image, mode, caption=None):
def interrogate_image(image, model, mode):
shared.state.begin()
shared.state.job = 'interrogate'
shared.state.begin('Interrogate')
try:
if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
lowvram.send_everything_to_cpu()
@ -295,8 +294,7 @@ def interrogate_batch(batch_files, batch_folder, batch_str, model, mode, write):
if len(files) == 0:
shared.log.error('Interrogate batch no images')
return ''
shared.state.begin()
shared.state.job = 'batch interrogate'
shared.state.begin('Batch interrogate')
prompts = []
try:
if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):

View File

@ -45,7 +45,7 @@ def download_civit_preview(model_path: str, preview_url: str):
block_size = 16384 # 16KB blocks
written = 0
img = None
shared.state.begin('civitai')
shared.state.begin('CivitAI')
try:
with open(preview_file, 'wb') as f:
with p.Progress(p.TextColumn('[cyan]{task.description}'), p.DownloadColumn(), p.BarColumn(), p.TaskProgressColumn(), p.TimeRemainingColumn(), p.TimeElapsedColumn(), p.TransferSpeedColumn(), console=shared.console) as progress:
@ -107,7 +107,7 @@ def download_civit_model_thread(model_name, model_url, model_path, model_type, t
total_size = int(r.headers.get('content-length', 0))
res += f' size={round((starting_pos + total_size)/1024/1024, 2)}Mb'
shared.log.info(res)
shared.state.begin('civitai')
shared.state.begin('CivitAI')
block_size = 16384 # 16KB blocks
written = starting_pos
global download_pbar # pylint: disable=global-statement
@ -162,7 +162,7 @@ def download_diffusers_model(hub_id: str, cache_dir: str = None, download_config
return None
from diffusers import DiffusionPipeline
import huggingface_hub as hf
shared.state.begin('huggingface')
shared.state.begin('HuggingFace')
if download_config is None:
download_config = {
"force_download": False,

View File

@ -10,7 +10,7 @@ from modules.shared import opts
def run_postprocessing(extras_mode, image, image_folder: List[tempfile.NamedTemporaryFile], input_dir, output_dir, show_extras_results, *args, save_output: bool = True):
devices.torch_gc()
shared.state.begin('extras')
shared.state.begin('Extras')
image_data = []
image_names = []
image_fullnames = []

View File

@ -169,7 +169,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
p.ops.append('upscale')
if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_highres_fix and hasattr(shared.sd_model, 'vae'):
save_intermediate(p, latents=output.images, suffix="-before-hires")
shared.state.job = 'upscale'
shared.state.job = 'Upscale'
output.images = resize_hires(p, latents=output.images)
sd_hijack_hypertile.hypertile_set(p, hr=True)
@ -211,7 +211,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
desc='Hires',
)
update_sampler(p, shared.sd_model, second_pass=True)
shared.state.job = 'hires'
shared.state.job = 'HiRes'
shared.state.sampling_steps = hires_args.get('num_inference_steps', None) or p.steps
try:
sd_models_compile.check_deepcache(enable=True)
@ -230,7 +230,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
# optional refiner pass or decode
if is_refiner_enabled():
prev_job = shared.state.job
shared.state.job = 'refine'
shared.state.job = 'Refine'
shared.state.job_count +=1
if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_refiner and hasattr(shared.sd_model, 'vae'):
save_intermediate(p, latents=output.images, suffix="-before-refiner")

View File

@ -191,7 +191,7 @@ def decode_first_stage(model, x, full_quality=True):
x_sample = torch.zeros((len(x), 3, x.shape[2] * 8, x.shape[3] * 8), dtype=devices.dtype_vae, device=devices.device)
return x_sample
prev_job = shared.state.job
shared.state.job = 'vae'
shared.state.job = 'VAE'
with devices.autocast(disable = x.dtype==devices.dtype_vae):
try:
if full_quality:

View File

@ -97,7 +97,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
p.extra_generation_params, p.restore_faces = bak_extra_generation_params, bak_restore_faces
images.save_image(image, p.outpath_samples, "", seeds[i], prompts[i], shared.opts.samples_format, info=info, suffix="-before-hires")
if latent_scale_mode is None or p.hr_force: # non-latent upscaling
shared.state.job = 'upscale'
shared.state.job = 'Upscale'
if decoded_samples is None:
decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality)
decoded_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0)
@ -126,7 +126,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
if p.hr_sampler_name == "PLMS":
p.hr_sampler_name = 'UniPC'
if p.hr_force or latent_scale_mode is not None:
shared.state.job = 'hires'
shared.state.job = 'HiRes'
if p.denoising_strength > 0:
p.ops.append('hires')
devices.torch_gc() # GC now before running the next img2img to prevent running out of memory

View File

@ -109,7 +109,7 @@ def taesd_vae_encode(image):
def vae_decode(latents, model, output_type='np', full_quality=True):
t0 = time.time()
prev_job = shared.state.job
shared.state.job = 'vae'
shared.state.job = 'VAE'
if not torch.is_tensor(latents): # already decoded
return latents
if latents.shape[0] == 0:

View File

@ -1487,7 +1487,7 @@ def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model')
return None
orig_state = copy.deepcopy(shared.state)
shared.state = shared_state.State()
shared.state.begin('load')
shared.state.begin('Load')
if load_dict:
shared.log.debug(f'Model dict: existing={sd_model is not None} target={checkpoint_info.filename} info={info}')
else:

View File

@ -278,6 +278,7 @@ def create_ui(startup_timer = None):
shutdown_submit = gr.Button(value="Shutdown server", variant='primary', elem_id="shutdown_submit")
unload_sd_model = gr.Button(value='Unload checkpoint', variant='primary', elem_id="sett_unload_sd_model")
reload_sd_model = gr.Button(value='Reload checkpoint', variant='primary', elem_id="sett_reload_sd_model")
enable_profiling = gr.Button(value='Start profiling', variant='primary', elem_id="start_profiling")
with gr.Tabs(elem_id="system") as system_tabs:
global ui_system_tabs # pylint: disable=global-statement
@ -363,8 +364,13 @@ def create_ui(startup_timer = None):
def reload_sd_weights():
modules.sd_models.reload_model_weights()
def switch_profiling():
shared.cmd_opts.profile = not shared.cmd_opts.profile
shared.log.warning(f'Profiling: {shared.cmd_opts.profile}')
unload_sd_model.click(fn=unload_sd_weights, inputs=[], outputs=[])
reload_sd_model.click(fn=reload_sd_weights, inputs=[], outputs=[])
enable_profiling.click(fn=switch_profiling, inputs=[], outputs=[])
request_notifications.click(fn=lambda: None, inputs=[], outputs=[], _js='function(){}')
preview_theme.click(fn=None, _js='previewTheme', inputs=[], outputs=[])

View File

@ -46,7 +46,7 @@ def generate_click(job_id: str, active_tab: str, *args):
time.sleep(0.01)
from modules.control.run import control_run
debug(f'Control: tab="{active_tab}" job={job_id} args={args}')
shared.state.begin('control')
shared.state.begin('Generate')
progress.add_task_to_queue(job_id)
with call_queue.queue_lock:
yield [None, None, None, None, 'Control: starting']

View File

@ -95,7 +95,7 @@ class Upscaler:
def upscale(self, img: Image, scale, selected_model: str = None):
orig_state = copy.deepcopy(shared.state)
shared.state.begin('upscale')
shared.state.begin('Upscale')
self.scale = scale
dest_w = int(img.width * scale)
dest_h = int(img.height * scale)

View File

@ -159,7 +159,7 @@ def load_model():
if not opts.sd_checkpoint_autoload or (shared.cmd_opts.ckpt is not None and shared.cmd_opts.ckpt.lower() != 'none'):
log.debug('Model auto load disabled')
else:
shared.state.begin('load')
shared.state.begin('Load')
thread_model = Thread(target=lambda: shared.sd_model)
thread_model.start()
thread_refiner = Thread(target=lambda: shared.sd_refiner)