mirror of https://github.com/vladmandic/automatic
torch dynamic profiling
parent d63f35e298
commit 554e5c8224

@@ -122,6 +122,8 @@
 - Update stable-fast with support for torch 2.2.2 and 2.3.0, thanks @Aptronymist
 - Add torch *cudaMallocAsync* in compute options
   Can improve memory utilization on compatible GPUs (RTX and newer)
+- Torch dynamic profiling
+  You can enable/disable full torch profiling on-the-fly from the settings top menu
 - Support for manually downloaded ControlNet models in both standalone and diffusers formats
   For standalone, simply copy the safetensors file to the `models/control/controlnet` folder
   For diffusers format, create a folder with the model name in `models/control/controlnet/`

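For the *cudaMallocAsync* option, a minimal sketch of how PyTorch's asynchronous CUDA allocator is normally selected. The environment-variable route shown here is stock PyTorch; the commit's own settings wiring is not shown in this diff:

import os

# select the asynchronous CUDA allocator before torch initializes CUDA
# (stock PyTorch mechanism; the commit exposes this as a compute option)
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'backend:cudaMallocAsync'

import torch  # noqa: E402 -- must be imported after the env var is set

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device='cuda')  # allocated via cudaMallocAsync
    print(f'allocated={torch.cuda.memory_allocated()}')
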
@@ -153,7 +153,7 @@ class APIControl():

         # run
         with self.queue_lock:
-            shared.state.begin('api-control', api=True)
+            shared.state.begin('API-CTL', api=True)
             output_images = []
             output_processed = []
             output_info = ''

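This hunk and most that follow rename the job titles passed to shared.state.begin(). A minimal, illustrative stand-in for that begin/end lifecycle (not the project's actual State class; fields and behavior here are assumed):

import time

class State:
    # illustrative stand-in: begin() records a job title and start time, end() clears it
    def __init__(self):
        self.job = ''
        self.time_start = 0.0

    def begin(self, title: str = '', api: bool = False):
        self.job = title  # e.g. 'API-CTL', one of the titles standardized in this commit
        self.time_start = time.time()

    def end(self, api: bool = False):
        print(f'job={self.job} duration={time.time() - self.time_start:.2f}s')
        self.job = ''

state = State()
state.begin('API-CTL', api=True)
time.sleep(0.1)  # stands in for the locked generation work
state.end(api=True)
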
@@ -104,7 +104,7 @@ class APIGenerate():
         p.scripts = script_runner
         p.outpath_grids = shared.opts.outdir_grids or shared.opts.outdir_txt2img_grids
         p.outpath_samples = shared.opts.outdir_samples or shared.opts.outdir_txt2img_samples
-        shared.state.begin('api-txt2img', api=True)
+        shared.state.begin('API TXT', api=True)
         script_args = script.init_script_args(p, txt2imgreq, self.default_script_arg_txt2img, selectable_scripts, selectable_script_idx, script_runner)
         if selectable_scripts is not None:
             processed = scripts.scripts_txt2img.run(p, *script_args) # Need to pass args as list here

@@ -148,7 +148,7 @@ class APIGenerate():
         p.scripts = script_runner
         p.outpath_grids = shared.opts.outdir_img2img_grids
         p.outpath_samples = shared.opts.outdir_img2img_samples
-        shared.state.begin('api-img2img', api=True)
+        shared.state.begin('API-IMG', api=True)
         script_args = script.init_script_args(p, img2imgreq, self.default_script_arg_img2img, selectable_scripts, selectable_script_idx, script_runner)
         if selectable_scripts is not None:
             processed = scripts.scripts_img2img.run(p, *script_args) # Need to pass args as list here

@@ -64,7 +64,7 @@ class APIProcess():
         for k, v in req.params.items():
             if k not in processors.config[processor.processor_id]['params']:
                 return JSONResponse(status_code=400, content={"error": f"Processor invalid parameter: id={req.model} {k}={v}"})
-        shared.state.begin('api-preprocess', api=True)
+        shared.state.begin('API-PRE', api=True)
         processed = processor(image, local_config=req.params)
         image = encode_pil_to_base64(processed)
         shared.state.end(api=False)

@@ -90,7 +90,7 @@ class APIProcess():
                     return JSONResponse(status_code=400, content={"error": f"Mask invalid parameter: {k}={v}"})
                 else:
                     setattr(masking.opts, k, v)
-        shared.state.begin('api-mask', api=True)
+        shared.state.begin('API-MASK', api=True)
         with self.queue_lock:
             processed = masking.run_mask(input_image=image, input_mask=mask, return_type=req.type)
         shared.state.end(api=False)

@@ -89,8 +89,18 @@ def profile(profiler, msg: str):

 def profile_torch(profiler, msg: str):
     profiler.stop()
+    lines = profiler.key_averages().table(sort_by="cpu_time_total", row_limit=12)
+    lines = lines.split('\n')
+    lines = [x for x in lines if '/profiler' not in x and '---' not in x]
+    txt = '\n'.join(lines)
+    log.debug(f'Torch profile CPU-total {msg}: \n{txt}')
     lines = profiler.key_averages().table(sort_by="self_cpu_time_total", row_limit=12)
     lines = lines.split('\n')
     lines = [x for x in lines if '/profiler' not in x and '---' not in x]
     txt = '\n'.join(lines)
-    log.debug(f'Torch profile {msg}: \n{txt}')
+    log.debug(f'Torch profile CPU-self {msg}: \n{txt}')
+    lines = profiler.key_averages().table(sort_by="cuda_time_total", row_limit=12)
+    lines = lines.split('\n')
+    lines = [x for x in lines if '/profiler' not in x and '---' not in x]
+    txt = '\n'.join(lines)
+    log.debug(f'Torch profile CUDA {msg}: \n{txt}')

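A usage sketch for profile_torch above, assuming it is in scope together with the module-level log it uses; the profiler construction and workload here are placeholders, not taken from the commit:

import logging
import torch
from torch.profiler import profile, ProfilerActivity

log = logging.getLogger(__name__)  # stand-in for the module logger used by profile_torch
logging.basicConfig(level=logging.DEBUG)

prof = profile(activities=[ProfilerActivity.CPU])  # add ProfilerActivity.CUDA on GPU systems; the CUDA table is only meaningful then
prof.start()
x = torch.randn(256, 256)
for _ in range(10):
    x = x @ x  # placeholder workload; a real run would wrap a generation step
profile_torch(prof, 'matmul-test')  # stops the profiler and logs the three sorted tables
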
@@ -54,7 +54,7 @@ def to_half(tensor, enable):


 def run_modelmerger(id_task, **kwargs): # pylint: disable=unused-argument
-    shared.state.begin('merge')
+    shared.state.begin('Merge')
     t0 = time.time()

     def fail(message):

@@ -284,7 +284,7 @@ def run_modelconvert(model, checkpoint_formats, precision, conv_type, custom_name
         "vae": vae_conv,
         "other": others_conv
     }
-    shared.state.begin('convert')
+    shared.state.begin('Convert')
     model_info = sd_models.checkpoints_list[model]
     shared.state.textinfo = f"Loading {model_info.filename}..."
     shared.log.info(f"Model convert loading: {model_info.filename}")

@@ -69,7 +69,7 @@ def sha256(filename, title, use_addnet_hash=False):
     if not os.path.isfile(filename):
         return None
     orig_state = copy.deepcopy(shared.state)
-    shared.state.begin("hash")
+    shared.state.begin("Hash")
     if use_addnet_hash:
         if progress_ok:
             try:

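sha256() snapshots shared.state before starting its nested "Hash" job so the outer job can be restored afterwards. A self-contained illustration of that snapshot/restore pattern (the shared namespace here is a stand-in, not the project's module):

import copy
from types import SimpleNamespace

shared = SimpleNamespace(state=SimpleNamespace(job='Load'))  # stand-in for modules.shared

orig_state = copy.deepcopy(shared.state)  # snapshot the in-flight job
shared.state.job = 'Hash'                 # what begin("Hash") sets for the nested job
print('nested job:', shared.state.job)
shared.state = orig_state                 # restore the outer job when done
print('restored job:', shared.state.job)
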
@@ -163,7 +163,7 @@ class InterrogateModels:

     def interrogate(self, pil_image):
         res = ""
-        shared.state.begin('interrogate')
+        shared.state.begin('Interrogate')
         try:
             if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
                 lowvram.send_everything_to_cpu()

@@ -267,8 +267,7 @@ def interrogate(image, mode, caption=None):


 def interrogate_image(image, model, mode):
-    shared.state.begin()
-    shared.state.job = 'interrogate'
+    shared.state.begin('Interrogate')
     try:
         if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):
             lowvram.send_everything_to_cpu()

@@ -295,8 +294,7 @@ def interrogate_batch(batch_files, batch_folder, batch_str, model, mode, write):
     if len(files) == 0:
         shared.log.error('Interrogate batch no images')
         return ''
-    shared.state.begin()
-    shared.state.job = 'batch interrogate'
+    shared.state.begin('Batch interrogate')
     prompts = []
     try:
         if shared.backend == shared.Backend.ORIGINAL and (shared.cmd_opts.lowvram or shared.cmd_opts.medvram):

@@ -45,7 +45,7 @@ def download_civit_preview(model_path: str, preview_url: str):
     block_size = 16384 # 16KB blocks
     written = 0
     img = None
-    shared.state.begin('civitai')
+    shared.state.begin('CivitAI')
     try:
         with open(preview_file, 'wb') as f:
             with p.Progress(p.TextColumn('[cyan]{task.description}'), p.DownloadColumn(), p.BarColumn(), p.TaskProgressColumn(), p.TimeRemainingColumn(), p.TimeElapsedColumn(), p.TransferSpeedColumn(), console=shared.console) as progress:

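The download loop above drives a rich progress bar; a minimal self-contained sketch of that pattern (URL handling and column choice abbreviated, not taken verbatim from the commit):

import requests
import rich.progress as p

def download(url: str, path: str):
    r = requests.get(url, stream=True, timeout=60)
    total_size = int(r.headers.get('content-length', 0))
    with p.Progress(p.TextColumn('[cyan]{task.description}'), p.DownloadColumn(), p.BarColumn(), p.TransferSpeedColumn()) as progress:
        task = progress.add_task('download', total=total_size)
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=16384):  # 16KB blocks, as in the commit
                f.write(chunk)
                progress.update(task, advance=len(chunk))
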
@@ -107,7 +107,7 @@ def download_civit_model_thread(model_name, model_url, model_path, model_type, t
     total_size = int(r.headers.get('content-length', 0))
     res += f' size={round((starting_pos + total_size)/1024/1024, 2)}Mb'
     shared.log.info(res)
-    shared.state.begin('civitai')
+    shared.state.begin('CivitAI')
     block_size = 16384 # 16KB blocks
     written = starting_pos
     global download_pbar # pylint: disable=global-statement

@@ -162,7 +162,7 @@ def download_diffusers_model(hub_id: str, cache_dir: str = None, download_config
         return None
     from diffusers import DiffusionPipeline
     import huggingface_hub as hf
-    shared.state.begin('huggingface')
+    shared.state.begin('HuggingFace')
     if download_config is None:
         download_config = {
             "force_download": False,

@@ -10,7 +10,7 @@ from modules.shared import opts

 def run_postprocessing(extras_mode, image, image_folder: List[tempfile.NamedTemporaryFile], input_dir, output_dir, show_extras_results, *args, save_output: bool = True):
     devices.torch_gc()
-    shared.state.begin('extras')
+    shared.state.begin('Extras')
     image_data = []
     image_names = []
     image_fullnames = []

@@ -169,7 +169,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
         p.ops.append('upscale')
         if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_highres_fix and hasattr(shared.sd_model, 'vae'):
             save_intermediate(p, latents=output.images, suffix="-before-hires")
-        shared.state.job = 'upscale'
+        shared.state.job = 'Upscale'
         output.images = resize_hires(p, latents=output.images)
         sd_hijack_hypertile.hypertile_set(p, hr=True)

@@ -211,7 +211,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
             desc='Hires',
         )
         update_sampler(p, shared.sd_model, second_pass=True)
-        shared.state.job = 'hires'
+        shared.state.job = 'HiRes'
         shared.state.sampling_steps = hires_args.get('num_inference_steps', None) or p.steps
         try:
             sd_models_compile.check_deepcache(enable=True)

@@ -230,7 +230,7 @@ def process_diffusers(p: processing.StableDiffusionProcessing):
     # optional refiner pass or decode
     if is_refiner_enabled():
         prev_job = shared.state.job
-        shared.state.job = 'refine'
+        shared.state.job = 'Refine'
        shared.state.job_count +=1
         if shared.opts.save and not p.do_not_save_samples and shared.opts.save_images_before_refiner and hasattr(shared.sd_model, 'vae'):
             save_intermediate(p, latents=output.images, suffix="-before-refiner")

@@ -191,7 +191,7 @@ def decode_first_stage(model, x, full_quality=True):
         x_sample = torch.zeros((len(x), 3, x.shape[2] * 8, x.shape[3] * 8), dtype=devices.dtype_vae, device=devices.device)
         return x_sample
     prev_job = shared.state.job
-    shared.state.job = 'vae'
+    shared.state.job = 'VAE'
     with devices.autocast(disable = x.dtype==devices.dtype_vae):
         try:
             if full_quality:

@@ -97,7 +97,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
                 p.extra_generation_params, p.restore_faces = bak_extra_generation_params, bak_restore_faces
                 images.save_image(image, p.outpath_samples, "", seeds[i], prompts[i], shared.opts.samples_format, info=info, suffix="-before-hires")
         if latent_scale_mode is None or p.hr_force: # non-latent upscaling
-            shared.state.job = 'upscale'
+            shared.state.job = 'Upscale'
             if decoded_samples is None:
                 decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality)
                 decoded_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0)

@@ -126,7 +126,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning,
         if p.hr_sampler_name == "PLMS":
             p.hr_sampler_name = 'UniPC'
         if p.hr_force or latent_scale_mode is not None:
-            shared.state.job = 'hires'
+            shared.state.job = 'HiRes'
             if p.denoising_strength > 0:
                 p.ops.append('hires')
                 devices.torch_gc() # GC now before running the next img2img to prevent running out of memory

@@ -109,7 +109,7 @@ def taesd_vae_encode(image):
 def vae_decode(latents, model, output_type='np', full_quality=True):
     t0 = time.time()
     prev_job = shared.state.job
-    shared.state.job = 'vae'
+    shared.state.job = 'VAE'
     if not torch.is_tensor(latents): # already decoded
         return latents
     if latents.shape[0] == 0:

@@ -1487,7 +1487,7 @@ def reload_model_weights(sd_model=None, info=None, reuse_dict=False, op='model')
         return None
     orig_state = copy.deepcopy(shared.state)
     shared.state = shared_state.State()
-    shared.state.begin('load')
+    shared.state.begin('Load')
     if load_dict:
         shared.log.debug(f'Model dict: existing={sd_model is not None} target={checkpoint_info.filename} info={info}')
     else:

@@ -278,6 +278,7 @@ def create_ui(startup_timer = None):
                 shutdown_submit = gr.Button(value="Shutdown server", variant='primary', elem_id="shutdown_submit")
                 unload_sd_model = gr.Button(value='Unload checkpoint', variant='primary', elem_id="sett_unload_sd_model")
                 reload_sd_model = gr.Button(value='Reload checkpoint', variant='primary', elem_id="sett_reload_sd_model")
+                enable_profiling = gr.Button(value='Start profiling', variant='primary', elem_id="start_profiling")

             with gr.Tabs(elem_id="system") as system_tabs:
                 global ui_system_tabs # pylint: disable=global-statement

@@ -363,8 +364,13 @@ def create_ui(startup_timer = None):
         def reload_sd_weights():
             modules.sd_models.reload_model_weights()

+        def switch_profiling():
+            shared.cmd_opts.profile = not shared.cmd_opts.profile
+            shared.log.warning(f'Profiling: {shared.cmd_opts.profile}')
+
         unload_sd_model.click(fn=unload_sd_weights, inputs=[], outputs=[])
         reload_sd_model.click(fn=reload_sd_weights, inputs=[], outputs=[])
+        enable_profiling.click(fn=switch_profiling, inputs=[], outputs=[])
         request_notifications.click(fn=lambda: None, inputs=[], outputs=[], _js='function(){}')
         preview_theme.click(fn=None, _js='previewTheme', inputs=[], outputs=[])

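switch_profiling() only flips shared.cmd_opts.profile at runtime; a sketch of how a per-step consumer might honor that flag (illustrative glue, assuming profile_torch from this commit is in scope; run_step and its arguments are hypothetical):

import torch
from torch.profiler import profile, ProfilerActivity
# from modules.errors import profile_torch  # assumed location of the helper shown earlier

def run_step(fn, profile_enabled: bool, msg: str):
    # when the UI toggle is off, run the step directly
    if not profile_enabled:
        return fn()
    prof = profile(activities=[ProfilerActivity.CPU])  # add ProfilerActivity.CUDA on GPU systems
    prof.start()
    result = fn()
    profile_torch(prof, msg)  # stops the profiler and logs the sorted tables
    return result

# usage: run_step(lambda: model(inputs), shared.cmd_opts.profile, 'txt2img')
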
@@ -46,7 +46,7 @@ def generate_click(job_id: str, active_tab: str, *args):
     time.sleep(0.01)
     from modules.control.run import control_run
     debug(f'Control: tab="{active_tab}" job={job_id} args={args}')
-    shared.state.begin('control')
+    shared.state.begin('Generate')
     progress.add_task_to_queue(job_id)
     with call_queue.queue_lock:
         yield [None, None, None, None, 'Control: starting']

@@ -95,7 +95,7 @@ class Upscaler:

     def upscale(self, img: Image, scale, selected_model: str = None):
         orig_state = copy.deepcopy(shared.state)
-        shared.state.begin('upscale')
+        shared.state.begin('Upscale')
         self.scale = scale
         dest_w = int(img.width * scale)
         dest_h = int(img.height * scale)

webui.py
@@ -159,7 +159,7 @@ def load_model():
     if not opts.sd_checkpoint_autoload or (shared.cmd_opts.ckpt is not None and shared.cmd_opts.ckpt.lower() != 'none'):
         log.debug('Model auto load disabled')
     else:
-        shared.state.begin('load')
+        shared.state.begin('Load')
         thread_model = Thread(target=lambda: shared.sd_model)
         thread_model.start()
         thread_refiner = Thread(target=lambda: shared.sd_refiner)