torch dynamic profiling

pull/3153/head
Vladimir Mandic 2024-05-16 18:40:18 -04:00
parent d63f35e298
commit 554e5c8224
19 changed files with 45 additions and 29 deletions

View File

@ -122,6 +122,8 @@
- Update stable-fast with support for torch 2.2.2 and 2.3.0, thanks @Aptronymist
- Add torch *cudaMallocAsync* in compute options
Can improve memory utilization on compatible GPUs (RTX and newer)
- Torch dynamic profiling
You can enable/disable full torch profiling on-the-fly from the settings top menu
- Support manually downloaded controlnet models in both standalone and diffusers formats
For standalone, simply copy safetensors file to `models/control/controlnet` folder
For diffusers format, create folder with model name in `models/control/controlnet/`

View File

@ -153,7 +153,7 @@ class APIControl():
# run
with self.queue_lock:
shared.state.begin('api-control', api=True)
shared.state.begin('API-CTL', api=True)
output_images = []
output_processed = []
output_info = ''

View File

@ -104,7 +104,7 @@ class APIGenerate():
p.scripts = script_runner
p.outpath_grids = shared.opts.outdir_grids or shared.opts.outdir_txt2img_grids
p.outpath_samples = shared.opts.outdir_samples or shared.opts.outdir_txt2img_samples
shared.state.begin('api-txt2img', api=True)
shared.state.begin('API TXT', api=True)
script_args = script.init_script_args(p, txt2imgreq, self.default_script_arg_txt2img, selectable_scripts, selectable_script_idx, script_runner)
if selectable_scripts is not None:
processed = scripts.scripts_txt2img.run(p, *script_args) # Need to pass args as list here
@ -148,7 +148,7 @@ class APIGenerate():
p.scripts = script_runner
p.outpath_grids = shared.opts.outdir_img2img_grids
p.outpath_samples = shared.opts.outdir_img2img_samples
shared.state.begin('api-img2img', api=True)
shared.state.begin('API-IMG', api=True)
script_args = script.init_script_args(p, img2imgreq, self.default_script_arg_img2img, selectable_scripts, selectable_script_idx, script_runner)
if selectable_scripts is not None:
processed = scripts.scripts_img2img.run(p, *script_args) # Need to pass args as list here

View File

@ -64,7 +64,7 @@ class APIProcess():
for k, v in req.params.items():
if k not in processors.config[processor.processor_id]['params']:
return JSONResponse(status_code=400, content={"error": f"Processor invalid parameter: id={req.model} {k}={v}"})
shared.state.begin('api-preprocess', api=True)
shared.state.begin('API-PRE', api=True)
processed = processor(image, local_config=req.params)
image = encode_pil_to_base64(processed)
shared.state.end(api=False)
@ -90,7 +90,7 @@ class APIProcess():
return JSONResponse(status_code=400, content={"error": f"Mask invalid parameter: {k}={v}"})
else:
setattr(masking.opts, k, v)
shared.state.begin('api-mask', api=True)
shared.state.begin('API-MASK', api=True)
with self.queue_lock:
processed = masking.run_mask(input_image=image, input_mask=mask, return_type=req.type)
shared.state.end(api=False)

View File

@ -89,8 +89,18 @@ def profile(profiler, msg: str):
def profile_torch(profiler, msg: str):
profiler.stop()
lines = profiler.key_averages().table(sort_by="cpu_time_total", row_limit=12)
lines = lines.split('\n')
lines = [x for x in lines if '/profiler' not in x and '---' not in x]
txt = '\n'.join(lines)
log.debug(f'Torch profile CPU-total {msg}: \n{txt}')
lines = profiler.key_averages().table(sort_by="self_cpu_time_total", row_limit=12)
lines = lines.split('\n')
lines = [x for x in lines if '/profiler' not in x and '---' not in x]
txt = '\n'.join(lines)
log.debug(f'Torch profile {msg}: \n{txt}')
log.debug(f'Torch profile CPU-self {msg}: \n{txt}')
lines = profiler.key_averages().table(sort_by="cuda_time_total", row_limit=12)
lines = lines.split('\n')
lines = [x for x in lines if '/profiler' not in x and '---' not in x]
txt = '\n'.join(lines)
log.debug(f'Torch profile CUDA {msg}: \n{txt}')

View File

@ -54,7 +54,7 @@ def to_half(tensor, enable):
def run_modelmerger(id_task, **kwargs): # pylint: disable=unused-argument
shared.state.begin('merge')
shared.state.begin('Merge')
t0 = time.time()
def fail(message):
@ -284,7 +284,7 @@ def run_modelconvert(model, checkpoint_formats, precision, conv_type, custom_nam
"vae": vae_conv,
"other": others_conv
}
shared.state.begin('convert')
shared.state.begin('Convert')
model_info = sd_models.checkpoints_list[model]
shared.state.textinfo = f"Loading {model_info.filename}..."
shared.log.info(f"Model convert loading: {model_info.filename}")

View File

@ -69,7 +69,7 @@ def sha256(filename, title, use_addnet_hash=False):
if not os.path.isfile(filename):
return None
orig_state = copy.deepcopy(shared.state)
shared.state.begin("hash")
shared.state.begin("Hash")
if use_addnet_hash:
if progress_ok:
try:

View File

@ -163,7 +163,7 @@ class InterrogateModels:
def interrogate(self, pil_image):
res = ""
shared.state.begin('interrogate')
shared.state.begin('Interrogate')
try:
if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
lowvram.send_everything_to_cpu()
@ -267,8 +267,7 @@ def interrogate(image, mode, caption=None):
def interrogate_image(image, model, mode):
shared.state.begin()
shared.state.job = 'interrogate'
shared.state.begin('Interrogate')
try:
if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
lowvram.send_everything_to_cpu()
@ -295,8 +294,7 @@ def interrogate_batch(batch_files, batch_folder, batch_str, model, mode, write):
if len(files) == 0:
shared.log.error('Interrogate batch no images')
return ''
shared.state.begin()
shared.state.job = 'batch interrogate'
shared.state.begin('Batch interrogate')
prompts = []
try:
if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):

View File

@ -45,7 +45,7 @@ def download_civit_preview(model_path: str, preview_url: str):
block_size = 16384 # 16KB blocks
written = 0
img = None
shared.state.begin('civitai')
shared.state.begin('CivitAI')
try:
with open(preview_file, 'wb') as f:
with p.Progress(p.TextColumn('[cyan]{task.description}'), p.DownloadColumn(), p.BarColumn(), p.TaskProgressColumn(), p.TimeRemainingColumn(), p.TimeElapsedColumn(), p.TransferSpeedColumn(), console=shared.console) as progress:
@ -107,7 +107,7 @@ def download_civit_model_thread(model_name, model_url, model_path, model_type, t
total_size = int(r.headers.get('content-length', 0))
res += f' size={round((starting_pos + total_size)/1024/1024, 2)}Mb'
shared.log.info(res)
shared.state.begin('civitai')
shared.state.begin('CivitAI')
block_size = 16384 # 16KB blocks
written = starting_pos
global download_pbar # pylint: disable=global-statement
@ -162,7 +162,7 @@ def download_diffusers_model(hub_id: str, cache_dir: str = None, download_config
return None
from diffusers import DiffusionPipeline
import huggingface_hub as hf
shared.state.begin('huggingface')
shared.state.begin('HuggingFace')
if download_config is None:
download_config = {
"force_download": False,

View File

@ -10,7 +10,7 @@ from modules.shared import opts
def run_postprocessing(extras_mode, image, image_folder: List[tempfile.NamedTemporaryFile], input_dir, output_dir, show_extras_results, *args, save_output: bool = True):
devices.torch_gc()
shared.state.begin('extras')
shared.state.begin('Extras')
image_data = []
image_names = []
image_fullnames = []

View File

@ -169,7 +169,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
p.ops.append('upscale')
if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_highres_fix and hasattr(shared.sd_model, 'vae'):
save_intermediate(p, latents=output.images, suffix="-before-hires")
shared.state.job = 'upscale'
shared.state.job = 'Upscale'
output.images = resize_hires(p, latents=output.images)
sd_hijack_hypertile.hypertile_set(p, hr=True)
@ -211,7 +211,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
desc='Hires',
)
update_sampler(p, shared.sd_model, second_pass=True)
shared.state.job = 'hires'
shared.state.job = 'HiRes'
shared.state.sampling_steps = hires_args.get('num_inference_steps', None) or p.steps
try:
sd_models_compile.check_deepcache(enable=True)
@ -230,7 +230,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
# optional refiner pass or decode
if is_refiner_enabled():
prev_job = shared.state.job
shared.state.job = 'refine'
shared.state.job = 'Refine'
shared.state.job_count +=1
if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_refiner and hasattr(shared.sd_model, 'vae'):
save_intermediate(p, latents=output.images, suffix="-before-refiner")

View File

@ -191,7 +191,7 @@ def decode_first_stage(model, x, full_quality=True):
x_sample = torch.zeros((len(x), 3, x.shape[2] * 8, x.shape[3] * 8), dtype=devices.dtype_vae, device=devices.device)
return x_sample
prev_job = shared.state.job
shared.state.job = 'vae'
shared.state.job = 'VAE'
with devices.autocast(disable = x.dtype==devices.dtype_vae):
try:
if full_quality:

View File

@ -97,7 +97,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
p.extra_generation_params, p.restore_faces = bak_extra_generation_params, bak_restore_faces
images.save_image(image, p.outpath_samples, "", seeds[i], prompts[i], shared.opts.samples_format, info=info, suffix="-before-hires")
if latent_scale_mode is None or p.hr_force: # non-latent upscaling
shared.state.job = 'upscale'
shared.state.job = 'Upscale'
if decoded_samples is None:
decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality)
decoded_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0)
@ -126,7 +126,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
if p.hr_sampler_name == "PLMS":
p.hr_sampler_name = 'UniPC'
if p.hr_force or latent_scale_mode is not None:
shared.state.job = 'hires'
shared.state.job = 'HiRes'
if p.denoising_strength > 0:
p.ops.append('hires')
devices.torch_gc() # GC now before running the next img2img to prevent running out of memory

View File

@ -109,7 +109,7 @@ def taesd_vae_encode(image):
def vae_decode(latents, model, output_type='np', full_quality=True):
t0 = time.time()
prev_job = shared.state.job
shared.state.job = 'vae'
shared.state.job = 'VAE'
if not torch.is_tensor(latents): # already decoded
return latents
if latents.shape[0] == 0:

View File

@ -1487,7 +1487,7 @@ def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model')
return None
orig_state = copy.deepcopy(shared.state)
shared.state = shared_state.State()
shared.state.begin('load')
shared.state.begin('Load')
if load_dict:
shared.log.debug(f'Model dict: existing={sd_model is not None} target={checkpoint_info.filename} info={info}')
else:

View File

@ -278,6 +278,7 @@ def create_ui(startup_timer = None):
shutdown_submit = gr.Button(value="Shutdown server", variant='primary', elem_id="shutdown_submit")
unload_sd_model = gr.Button(value='Unload checkpoint', variant='primary', elem_id="sett_unload_sd_model")
reload_sd_model = gr.Button(value='Reload checkpoint', variant='primary', elem_id="sett_reload_sd_model")
enable_profiling = gr.Button(value='Start profiling', variant='primary', elem_id="start_profiling")
with gr.Tabs(elem_id="system") as system_tabs:
global ui_system_tabs # pylint: disable=global-statement
@ -363,8 +364,13 @@ def create_ui(startup_timer = None):
def reload_sd_weights():
modules.sd_models.reload_model_weights()
def switch_profiling():
shared.cmd_opts.profile = not shared.cmd_opts.profile
shared.log.warning(f'Profiling: {shared.cmd_opts.profile}')
unload_sd_model.click(fn=unload_sd_weights, inputs=[], outputs=[])
reload_sd_model.click(fn=reload_sd_weights, inputs=[], outputs=[])
enable_profiling.click(fn=switch_profiling, inputs=[], outputs=[])
request_notifications.click(fn=lambda: None, inputs=[], outputs=[], _js='function(){}')
preview_theme.click(fn=None, _js='previewTheme', inputs=[], outputs=[])

View File

@ -46,7 +46,7 @@ def generate_click(job_id: str, active_tab: str, *args):
time.sleep(0.01)
from modules.control.run import control_run
debug(f'Control: tab="{active_tab}" job={job_id} args={args}')
shared.state.begin('control')
shared.state.begin('Generate')
progress.add_task_to_queue(job_id)
with call_queue.queue_lock:
yield [None, None, None, None, 'Control: starting']

View File

@ -95,7 +95,7 @@ class Upscaler:
def upscale(self, img: Image, scale, selected_model: str = None):
orig_state = copy.deepcopy(shared.state)
shared.state.begin('upscale')
shared.state.begin('Upscale')
self.scale = scale
dest_w = int(img.width * scale)
dest_h = int(img.height * scale)

View File

@ -159,7 +159,7 @@ def load_model():
if not opts.sd_checkpoint_autoload or (shared.cmd_opts.ckpt is not None and shared.cmd_opts.ckpt.lower() != 'none'):
log.debug('Model auto load disabled')
else:
shared.state.begin('load')
shared.state.begin('Load')
thread_model = Thread(target=lambda: shared.sd_model)
thread_model.start()
thread_refiner = Thread(target=lambda: shared.sd_refiner)