diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9c8b34f18..3dc12c82b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -122,6 +122,8 @@
   - Update stable-fast with support for torch 2.2.2 and 2.3.0, thanks @Aptronymist
   - Add torch *cudaMallocAsync* in compute options  
     Can improve memory utilization on compatible GPUs (RTX and newer)  
+  - Torch dynamic profiling  
+    You can enable/disable full torch profiling on-the-fly from the settings top menu  
   - Support controlnet manually downloads models in both standalone and diffusers format  
     For standalone, simply copy safetensors file to `models/control/controlnet` folder  
     For diffusers format, create folder with model name in `models/control/controlnet/`  
diff --git a/modules/api/control.py b/modules/api/control.py
index d7db1fe6f..7c0607b34 100644
--- a/modules/api/control.py
+++ b/modules/api/control.py
@@ -153,7 +153,7 @@ class APIControl():
 
         # run
         with self.queue_lock:
-            shared.state.begin('api-control', api=True)
+            shared.state.begin('API-CTL', api=True)
             output_images = []
             output_processed = []
             output_info = ''
diff --git a/modules/api/generate.py b/modules/api/generate.py
index 6bfc33ab8..371f3c0bf 100644
--- a/modules/api/generate.py
+++ b/modules/api/generate.py
@@ -104,7 +104,7 @@ class APIGenerate():
             p.scripts = script_runner
             p.outpath_grids = shared.opts.outdir_grids or shared.opts.outdir_txt2img_grids
             p.outpath_samples = shared.opts.outdir_samples or shared.opts.outdir_txt2img_samples
-            shared.state.begin('api-txt2img', api=True)
+            shared.state.begin('API-TXT', api=True)
             script_args = script.init_script_args(p, txt2imgreq, self.default_script_arg_txt2img, selectable_scripts, selectable_script_idx, script_runner)
             if selectable_scripts is not None:
                 processed = scripts.scripts_txt2img.run(p, *script_args) # Need to pass args as list here
@@ -148,7 +148,7 @@ class APIGenerate():
             p.scripts = script_runner
             p.outpath_grids = shared.opts.outdir_img2img_grids
             p.outpath_samples = shared.opts.outdir_img2img_samples
-            shared.state.begin('api-img2img', api=True)
+            shared.state.begin('API-IMG', api=True)
             script_args = script.init_script_args(p, img2imgreq, self.default_script_arg_img2img, selectable_scripts, selectable_script_idx, script_runner)
             if selectable_scripts is not None:
                 processed = scripts.scripts_img2img.run(p, *script_args) # Need to pass args as list here
diff --git a/modules/api/process.py b/modules/api/process.py
index 343b6efb4..830aa14d9 100644
--- a/modules/api/process.py
+++ b/modules/api/process.py
@@ -64,7 +64,7 @@ class APIProcess():
         for k, v in req.params.items():
             if k not in processors.config[processor.processor_id]['params']:
                 return JSONResponse(status_code=400, content={"error": f"Processor invalid parameter: id={req.model} {k}={v}"})
-        shared.state.begin('api-preprocess', api=True)
+        shared.state.begin('API-PRE', api=True)
         processed = processor(image, local_config=req.params)
         image = encode_pil_to_base64(processed)
         shared.state.end(api=False)
@@ -90,7 +90,7 @@ class APIProcess():
                 return JSONResponse(status_code=400, content={"error": f"Mask invalid parameter: {k}={v}"})
             else:
                 setattr(masking.opts, k, v)
-        shared.state.begin('api-mask', api=True)
+        shared.state.begin('API-MASK', api=True)
         with self.queue_lock:
             processed = masking.run_mask(input_image=image, input_mask=mask, return_type=req.type)
         shared.state.end(api=False)
diff --git a/modules/errors.py b/modules/errors.py
index ef650f52c..f3bf4319e 100644
--- a/modules/errors.py
+++ b/modules/errors.py
@@ -89,8 +89,15 @@ def profile(profiler, msg: str):
 
 def profile_torch(profiler, msg: str):
+    """
+    Stop a torch profiler and write its aggregated results to the debug log.
+
+    Logs three tables sorted by total CPU time, self CPU time and total CUDA time,
+    each limited to 12 rows, with lines containing '/profiler' or '---' removed.
+    """
     profiler.stop()
-    lines = profiler.key_averages().table(sort_by="self_cpu_time_total", row_limit=12)
-    lines = lines.split('\n')
-    lines = [x for x in lines if '/profiler' not in x and '---' not in x]
-    txt = '\n'.join(lines)
-    log.debug(f'Torch profile {msg}: \n{txt}')
+    averages = profiler.key_averages() # aggregate once and reuse for every table
+    for label, sort_by in (('CPU-total', 'cpu_time_total'), ('CPU-self', 'self_cpu_time_total'), ('CUDA', 'cuda_time_total')):
+        lines = averages.table(sort_by=sort_by, row_limit=12).split('\n')
+        lines = [x for x in lines if '/profiler' not in x and '---' not in x]
+        txt = '\n'.join(lines)
+        log.debug(f'Torch profile {label} {msg}: \n{txt}')
diff --git a/modules/extras.py b/modules/extras.py
index 150bcd8d0..cc646990d 100644
--- a/modules/extras.py
+++ b/modules/extras.py
@@ -54,7 +54,7 @@ def to_half(tensor, enable):
 
 
 def run_modelmerger(id_task, **kwargs):  # pylint: disable=unused-argument
-    shared.state.begin('merge')
+    shared.state.begin('Merge')
     t0 = time.time()
 
     def fail(message):
@@ -284,7 +284,7 @@ def run_modelconvert(model, checkpoint_formats, precision, conv_type, custom_nam
         "vae": vae_conv,
         "other": others_conv
     }
-    shared.state.begin('convert')
+    shared.state.begin('Convert')
     model_info = sd_models.checkpoints_list[model]
     shared.state.textinfo = f"Loading {model_info.filename}..."
     shared.log.info(f"Model convert loading: {model_info.filename}")
diff --git a/modules/hashes.py b/modules/hashes.py
index 607140fe6..cf83794b0 100644
--- a/modules/hashes.py
+++ b/modules/hashes.py
@@ -69,7 +69,7 @@ def sha256(filename, title, use_addnet_hash=False):
     if not os.path.isfile(filename):
         return None
     orig_state = copy.deepcopy(shared.state)
-    shared.state.begin("hash")
+    shared.state.begin("Hash")
     if use_addnet_hash:
         if progress_ok:
             try:
diff --git a/modules/interrogate.py b/modules/interrogate.py
index fdd23fd36..ae4cd2926 100644
--- a/modules/interrogate.py
+++ b/modules/interrogate.py
@@ -163,7 +163,7 @@ class InterrogateModels:
 
     def interrogate(self, pil_image):
         res = ""
-        shared.state.begin('interrogate')
+        shared.state.begin('Interrogate')
         try:
             if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
                 lowvram.send_everything_to_cpu()
@@ -267,8 +267,7 @@ def interrogate(image, mode, caption=None):
 
 
 def interrogate_image(image, model, mode):
-    shared.state.begin()
-    shared.state.job = 'interrogate'
+    shared.state.begin('Interrogate')
     try:
         if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
             lowvram.send_everything_to_cpu()
@@ -295,8 +294,7 @@ def interrogate_batch(batch_files, batch_folder, batch_str, model, mode, write):
     if len(files) == 0:
         shared.log.error('Interrogate batch no images')
         return ''
-    shared.state.begin()
-    shared.state.job = 'batch interrogate'
+    shared.state.begin('Batch interrogate')
     prompts = []
     try:
         if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
diff --git a/modules/modelloader.py b/modules/modelloader.py
index f541281a1..71114ed3c 100644
--- a/modules/modelloader.py
+++ b/modules/modelloader.py
@@ -45,7 +45,7 @@ def download_civit_preview(model_path: str, preview_url: str):
     block_size = 16384 # 16KB blocks
     written = 0
     img = None
-    shared.state.begin('civitai')
+    shared.state.begin('CivitAI')
     try:
         with open(preview_file, 'wb') as f:
             with p.Progress(p.TextColumn('[cyan]{task.description}'), p.DownloadColumn(), p.BarColumn(), p.TaskProgressColumn(), p.TimeRemainingColumn(), p.TimeElapsedColumn(), p.TransferSpeedColumn(), console=shared.console) as progress:
@@ -107,7 +107,7 @@ def download_civit_model_thread(model_name, model_url, model_path, model_type, t
     total_size = int(r.headers.get('content-length', 0))
     res += f' size={round((starting_pos + total_size)/1024/1024, 2)}Mb'
     shared.log.info(res)
-    shared.state.begin('civitai')
+    shared.state.begin('CivitAI')
     block_size = 16384 # 16KB blocks
     written = starting_pos
     global download_pbar # pylint: disable=global-statement
@@ -162,7 +162,7 @@ def download_diffusers_model(hub_id: str, cache_dir: str = None, download_config
         return None
     from diffusers import DiffusionPipeline
     import huggingface_hub as hf
-    shared.state.begin('huggingface')
+    shared.state.begin('HuggingFace')
     if download_config is None:
         download_config = {
             "force_download": False,
diff --git a/modules/postprocessing.py b/modules/postprocessing.py
index db3645d75..4e307bec3 100644
--- a/modules/postprocessing.py
+++ b/modules/postprocessing.py
@@ -10,7 +10,7 @@ from modules.shared import opts
 
 def run_postprocessing(extras_mode, image, image_folder: List[tempfile.NamedTemporaryFile], input_dir, output_dir, show_extras_results, *args, save_output: bool = True):
     devices.torch_gc()
-    shared.state.begin('extras')
+    shared.state.begin('Extras')
     image_data = []
     image_names = []
     image_fullnames = []
diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py
index 779b5352a..2fe0a3b3b 100644
--- a/modules/processing_diffusers.py
+++ b/modules/processing_diffusers.py
@@ -169,7 +169,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
             p.ops.append('upscale')
             if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_highres_fix and hasattr(shared.sd_model, 'vae'):
                 save_intermediate(p, latents=output.images, suffix="-before-hires")
-            shared.state.job = 'upscale'
+            shared.state.job = 'Upscale'
             output.images = resize_hires(p, latents=output.images)
             sd_hijack_hypertile.hypertile_set(p, hr=True)
 
@@ -211,7 +211,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
                 desc='Hires',
             )
             update_sampler(p, shared.sd_model, second_pass=True)
-            shared.state.job = 'hires'
+            shared.state.job = 'HiRes'
             shared.state.sampling_steps = hires_args.get('num_inference_steps', None) or p.steps
             try:
                 sd_models_compile.check_deepcache(enable=True)
@@ -230,7 +230,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
     # optional refiner pass or decode
     if is_refiner_enabled():
         prev_job = shared.state.job
-        shared.state.job = 'refine'
+        shared.state.job = 'Refine'
         shared.state.job_count +=1
         if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_refiner and hasattr(shared.sd_model, 'vae'):
             save_intermediate(p, latents=output.images, suffix="-before-refiner")
diff --git a/modules/processing_helpers.py b/modules/processing_helpers.py
index 30d23d311..3d4ae2a43 100644
--- a/modules/processing_helpers.py
+++ b/modules/processing_helpers.py
@@ -191,7 +191,7 @@ def decode_first_stage(model, x, full_quality=True):
         x_sample = torch.zeros((len(x), 3, x.shape[2] * 8, x.shape[3] * 8), dtype=devices.dtype_vae, device=devices.device)
         return x_sample
     prev_job = shared.state.job
-    shared.state.job = 'vae'
+    shared.state.job = 'VAE'
     with devices.autocast(disable = x.dtype==devices.dtype_vae):
         try:
             if full_quality:
diff --git a/modules/processing_original.py b/modules/processing_original.py
index 0f61cf41b..822c2ed2d 100644
--- a/modules/processing_original.py
+++ b/modules/processing_original.py
@@ -97,7 +97,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
                 p.extra_generation_params, p.restore_faces = bak_extra_generation_params, bak_restore_faces
                 images.save_image(image, p.outpath_samples, "", seeds[i], prompts[i], shared.opts.samples_format, info=info, suffix="-before-hires")
         if latent_scale_mode is None or p.hr_force: # non-latent upscaling
-            shared.state.job = 'upscale'
+            shared.state.job = 'Upscale'
             if decoded_samples is None:
                 decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality)
                 decoded_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0)
@@ -126,7 +126,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
             if p.hr_sampler_name == "PLMS":
                 p.hr_sampler_name = 'UniPC'
         if p.hr_force or latent_scale_mode is not None:
-            shared.state.job = 'hires'
+            shared.state.job = 'HiRes'
             if p.denoising_strength > 0:
                 p.ops.append('hires')
                 devices.torch_gc() # GC now before running the next img2img to prevent running out of memory
diff --git a/modules/processing_vae.py b/modules/processing_vae.py
index 1120b4ba1..972b6b7b8 100644
--- a/modules/processing_vae.py
+++ b/modules/processing_vae.py
@@ -109,7 +109,7 @@ def taesd_vae_encode(image):
 def vae_decode(latents, model, output_type='np', full_quality=True):
     t0 = time.time()
     prev_job = shared.state.job
-    shared.state.job = 'vae'
+    shared.state.job = 'VAE'
     if not torch.is_tensor(latents): # already decoded
         return latents
     if latents.shape[0] == 0:
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 3b476d4f6..a4279f6b6 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -1487,7 +1487,7 @@ def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model')
         return None
     orig_state = copy.deepcopy(shared.state)
     shared.state = shared_state.State()
-    shared.state.begin('load')
+    shared.state.begin('Load')
     if load_dict:
         shared.log.debug(f'Model dict: existing={sd_model is not None} target={checkpoint_info.filename} info={info}')
     else:
diff --git a/modules/ui.py b/modules/ui.py
index 45f7ed79f..71819947a 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -278,6 +278,7 @@ def create_ui(startup_timer = None):
             shutdown_submit = gr.Button(value="Shutdown server", variant='primary', elem_id="shutdown_submit")
             unload_sd_model = gr.Button(value='Unload checkpoint', variant='primary', elem_id="sett_unload_sd_model")
             reload_sd_model = gr.Button(value='Reload checkpoint', variant='primary', elem_id="sett_reload_sd_model")
+            enable_profiling = gr.Button(value='Start profiling', variant='primary', elem_id="start_profiling")
 
         with gr.Tabs(elem_id="system") as system_tabs:
             global ui_system_tabs # pylint: disable=global-statement
@@ -363,8 +364,13 @@ def create_ui(startup_timer = None):
         def reload_sd_weights():
             modules.sd_models.reload_model_weights()
 
+        def switch_profiling():
+            shared.cmd_opts.profile = not shared.cmd_opts.profile
+            shared.log.warning(f'Profiling: {shared.cmd_opts.profile}')
+
         unload_sd_model.click(fn=unload_sd_weights, inputs=[], outputs=[])
         reload_sd_model.click(fn=reload_sd_weights, inputs=[], outputs=[])
+        enable_profiling.click(fn=switch_profiling, inputs=[], outputs=[])
         request_notifications.click(fn=lambda: None, inputs=[], outputs=[], _js='function(){}')
         preview_theme.click(fn=None, _js='previewTheme', inputs=[], outputs=[])
 
diff --git a/modules/ui_control.py b/modules/ui_control.py
index f63e19baf..d373cb26c 100644
--- a/modules/ui_control.py
+++ b/modules/ui_control.py
@@ -46,7 +46,7 @@ def generate_click(job_id: str, active_tab: str, *args):
         time.sleep(0.01)
     from modules.control.run import control_run
     debug(f'Control: tab="{active_tab}" job={job_id} args={args}')
-    shared.state.begin('control')
+    shared.state.begin('Generate')
     progress.add_task_to_queue(job_id)
     with call_queue.queue_lock:
         yield [None, None, None, None, 'Control: starting']
diff --git a/modules/upscaler.py b/modules/upscaler.py
index 2f66c5d34..744e60052 100644
--- a/modules/upscaler.py
+++ b/modules/upscaler.py
@@ -95,7 +95,7 @@ class Upscaler:
 
     def upscale(self, img: Image, scale, selected_model: str = None):
         orig_state = copy.deepcopy(shared.state)
-        shared.state.begin('upscale')
+        shared.state.begin('Upscale')
         self.scale = scale
         dest_w = int(img.width * scale)
         dest_h = int(img.height * scale)
diff --git a/webui.py b/webui.py
index 0f88cf5d1..718f8b0c8 100644
--- a/webui.py
+++ b/webui.py
@@ -159,7 +159,7 @@ def load_model():
     if not opts.sd_checkpoint_autoload or (shared.cmd_opts.ckpt is not None and shared.cmd_opts.ckpt.lower() != 'none'):
         log.debug('Model auto load disabled')
     else:
-        shared.state.begin('load')
+        shared.state.begin('Load')
         thread_model = Thread(target=lambda: shared.sd_model)
         thread_model.start()
         thread_refiner = Thread(target=lambda: shared.sd_refiner)