diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ac1bd607..2c248a1d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,20 @@ # Change Log for SD.Next -## Update for 2025-02-20 +## Update for 2025-02-22 -Quick release refresh: -- remove ui splash screen on auth fail -- add `--extensions-dir` cli arg and `SD_EXTENSIONSDIR` env variable to specify extensions directory -- log full path when reading/saving `config.json` -- log full path to `sdnext.log` -- log system hostname in `sdnext.log` -- log extensions path in `sdnext.log` +- **Decode** + - Final step of image generation, VAE decode, is by far the most memory intensive operation and can easily result in out-of-memory errors + What can be done? Well, *Huggingface* is now providing *free-of-charge* **remote-VAE-decode** service! + - How to use? Previous *Full quality* option in UI is replaced with a VAE type selector: Full, Tiny, Remote + Currently supports SD15, SDXL and FLUX.1 with more models expected in the near future + Availability is limited, so if remote processing fails SD.Next will fall back to using normal VAE decode process +- **Other** + - add `--extensions-dir` cli arg and `SD_EXTENSIONSDIR` env variable to specify extensions directory +- **Fixes** + - remove ui splash screen on auth fail + - log full config path, full log path, system name, extensions path + - zluda update + - fix zluda with pulid ## Update for 2025-02-18 diff --git a/cli/run-benchmark.py b/cli/run-benchmark.py index 2b09966de..ff5664fdc 100755 --- a/cli/run-benchmark.py +++ b/cli/run-benchmark.py @@ -134,7 +134,7 @@ if __name__ == '__main__': "sampler_name": args.sampler, "width": args.width, "height": args.height, - "full_quality": not args.taesd, + "vae_type": 'Tiny' if args.taesd else 'Full', "cfg_scale": 0, "batch_size": 1, "n_iter": 1, diff --git a/modules/control/run.py b/modules/control/run.py index c052bc4e8..d6dccbd75 100644 --- a/modules/control/run.py +++ b/modules/control/run.py @@ -228,7 +228,7 @@ def
control_run(state: str = '', steps: int = 20, sampler_index: int = None, seed: int = -1, subseed: int = -1, subseed_strength: float = 0, seed_resize_from_h: int = -1, seed_resize_from_w: int = -1, cfg_scale: float = 6.0, clip_skip: float = 1.0, image_cfg_scale: float = 6.0, diffusers_guidance_rescale: float = 0.7, pag_scale: float = 0.0, pag_adaptive: float = 0.5, cfg_end: float = 1.0, - full_quality: bool = True, tiling: bool = False, hidiffusion: bool = False, + vae_type: str = 'Full', tiling: bool = False, hidiffusion: bool = False, detailer_enabled: bool = True, detailer_prompt: str = '', detailer_negative: str = '', detailer_steps: int = 10, detailer_strength: float = 0.3, hdr_mode: int = 0, hdr_brightness: float = 0, hdr_color: float = 0, hdr_sharpen: float = 0, hdr_clamp: bool = False, hdr_boundary: float = 4.0, hdr_threshold: float = 0.95, hdr_maximize: bool = False, hdr_max_center: float = 0.6, hdr_max_boundry: float = 1.0, hdr_color_picker: str = None, hdr_tint_ratio: float = 0, @@ -292,7 +292,7 @@ def control_run(state: str = '', diffusers_guidance_rescale = diffusers_guidance_rescale, pag_scale = pag_scale, pag_adaptive = pag_adaptive, - full_quality = full_quality, + vae_type = vae_type, tiling = tiling, hidiffusion = hidiffusion, # detailer diff --git a/modules/images_resize.py b/modules/images_resize.py index a549b5bf9..9e37d69cd 100644 --- a/modules/images_resize.py +++ b/modules/images_resize.py @@ -16,9 +16,9 @@ def resize_image(resize_mode: int, im: Union[Image.Image, torch.Tensor], width: return im else: from modules.processing_vae import vae_encode, vae_decode - latents = vae_encode(im, shared.sd_model, full_quality=False) # TODO resize image: enable full VAE mode for resize-latent + latents = vae_encode(im, shared.sd_model, vae_type='Tiny') # TODO resize image: enable full VAE mode for resize-latent latents = selected_upscaler.scaler.upscale(latents, scale, selected_upscaler.name) - im = vae_decode(latents, shared.sd_model, output_type='pil', 
full_quality=False)[0] + im = vae_decode(latents, shared.sd_model, output_type='pil', vae_type='Tiny')[0] return im def resize(im: Union[Image.Image, torch.Tensor], w, h): diff --git a/modules/img2img.py b/modules/img2img.py index 7a5a33cd0..75971c608 100644 --- a/modules/img2img.py +++ b/modules/img2img.py @@ -139,7 +139,7 @@ def img2img(id_task: str, state: str, mode: int, sampler_index, mask_blur, mask_alpha, inpainting_fill, - full_quality, tiling, hidiffusion, + vae_type, tiling, hidiffusion, detailer_enabled, detailer_prompt, detailer_negative, detailer_steps, detailer_strength, n_iter, batch_size, cfg_scale, image_cfg_scale, @@ -241,7 +241,7 @@ def img2img(id_task: str, state: str, mode: int, clip_skip=clip_skip, width=width, height=height, - full_quality=full_quality, + vae_type=vae_type, tiling=tiling, hidiffusion=hidiffusion, detailer_enabled=detailer_enabled, diff --git a/modules/infotext.py b/modules/infotext.py index e8788c2f5..574a732b3 100644 --- a/modules/infotext.py +++ b/modules/infotext.py @@ -105,7 +105,7 @@ def parse(infotext): elif val == "False": params[key] = False elif key == 'VAE' and val == 'TAESD': - params["Full quality"] = False + params["VAE type"] = 'Tiny' elif size is not None: params[f"{key}-1"] = int(size.group(1)) params[f"{key}-2"] = int(size.group(2)) diff --git a/modules/processing.py b/modules/processing.py index de7bd93de..02eee0206 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -151,7 +151,7 @@ def process_images(p: StableDiffusionProcessing) -> Processed: sd_models.reload_model_weights() if p.override_settings.get('sd_vae', None) is not None: if p.override_settings.get('sd_vae', None) == 'TAESD': - p.full_quality = False + p.vae_type = 'Tiny' p.override_settings.pop('sd_vae', None) if p.override_settings.get('Hires upscaler', None) is not None: p.enable_hr = True diff --git a/modules/processing_args.py b/modules/processing_args.py index d1ba840ea..ff693a03e 100644 --- a/modules/processing_args.py +++ 
b/modules/processing_args.py @@ -79,7 +79,7 @@ def task_specific_kwargs(p, model): } if model.__class__.__name__ == 'LatentConsistencyModelPipeline' and hasattr(p, 'init_images') and len(p.init_images) > 0: p.ops.append('lcm') - init_latents = [processing_vae.vae_encode(image, model=shared.sd_model, full_quality=p.full_quality).squeeze(dim=0) for image in p.init_images] + init_latents = [processing_vae.vae_encode(image, model=shared.sd_model, vae_type=p.vae_type).squeeze(dim=0) for image in p.init_images] init_latent = torch.stack(init_latents, dim=0).to(shared.device) init_noise = p.denoising_strength * processing.create_random_tensors(init_latent.shape[1:], seeds=p.all_seeds, subseeds=p.all_subseeds, subseed_strength=p.subseed_strength, p=p) init_latent = (1 - p.denoising_strength) * init_latent + init_noise diff --git a/modules/processing_class.py b/modules/processing_class.py index b3fa3e313..d86e3e5aa 100644 --- a/modules/processing_class.py +++ b/modules/processing_class.py @@ -48,7 +48,7 @@ class StableDiffusionProcessing: styles: List[str] = [], # vae tiling: bool = False, - full_quality: bool = True, + vae_type: str = 'Full', # other hidiffusion: bool = False, do_not_reload_embeddings: bool = False, @@ -169,7 +169,7 @@ class StableDiffusionProcessing: self.negative_prompt = negative_prompt self.styles = styles self.tiling = tiling - self.full_quality = full_quality + self.vae_type = vae_type self.hidiffusion = hidiffusion self.do_not_reload_embeddings = do_not_reload_embeddings self.detailer_enabled = detailer_enabled diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py index 06d5f7eb2..acea34874 100644 --- a/modules/processing_diffusers.py +++ b/modules/processing_diffusers.py @@ -197,10 +197,10 @@ def process_hires(p: processing.StableDiffusionProcessing, output): if p.hr_force: shared.sd_model = sd_models.set_diffuser_pipe(shared.sd_model, sd_models.DiffusersTaskType.IMAGE_2_IMAGE) if 'Upscale' in 
shared.sd_model.__class__.__name__ or 'Flux' in shared.sd_model.__class__.__name__ or 'Kandinsky' in shared.sd_model.__class__.__name__: - output.images = processing_vae.vae_decode(latents=output.images, model=shared.sd_model, full_quality=p.full_quality, output_type='pil', width=p.width, height=p.height) + output.images = processing_vae.vae_decode(latents=output.images, model=shared.sd_model, vae_type=p.vae_type, output_type='pil', width=p.width, height=p.height) if p.is_control and hasattr(p, 'task_args') and p.task_args.get('image', None) is not None: if hasattr(shared.sd_model, "vae") and output.images is not None and len(output.images) > 0: - output.images = processing_vae.vae_decode(latents=output.images, model=shared.sd_model, full_quality=p.full_quality, output_type='pil', width=p.hr_upscale_to_x, height=p.hr_upscale_to_y) # controlnet cannnot deal with latent input + output.images = processing_vae.vae_decode(latents=output.images, model=shared.sd_model, vae_type=p.vae_type, output_type='pil', width=p.hr_upscale_to_x, height=p.hr_upscale_to_y) # controlnet cannnot deal with latent input update_sampler(p, shared.sd_model, second_pass=True) orig_denoise = p.denoising_strength p.denoising_strength = strength @@ -289,7 +289,7 @@ def process_refine(p: processing.StableDiffusionProcessing, output): noise_level = round(350 * p.denoising_strength) output_type='latent' if 'Upscale' in shared.sd_refiner.__class__.__name__ or 'Flux' in shared.sd_refiner.__class__.__name__ or 'Kandinsky' in shared.sd_refiner.__class__.__name__: - image = processing_vae.vae_decode(latents=image, model=shared.sd_model, full_quality=p.full_quality, output_type='pil', width=p.width, height=p.height) + image = processing_vae.vae_decode(latents=image, model=shared.sd_model, vae_type=p.vae_type, output_type='pil', width=p.width, height=p.height) p.extra_generation_params['Noise level'] = noise_level output_type = 'np' update_sampler(p, shared.sd_refiner, second_pass=True) @@ -370,7 +370,7 @@ 
def process_decode(p: processing.StableDiffusionProcessing, output): result_batch = processing_vae.vae_decode( latents = output.images[i], model = model, - full_quality = p.full_quality, + vae_type = p.vae_type, width = width, height = height, frames = frames, @@ -381,7 +381,7 @@ def process_decode(p: processing.StableDiffusionProcessing, output): results = processing_vae.vae_decode( latents = output.images, model = model, - full_quality = p.full_quality, + vae_type = p.vae_type, width = width, height = height, frames = frames, diff --git a/modules/processing_helpers.py b/modules/processing_helpers.py index cd8c1bb9a..a5abba6a2 100644 --- a/modules/processing_helpers.py +++ b/modules/processing_helpers.py @@ -201,7 +201,7 @@ def create_random_tensors(shape, seeds, subseeds=None, subseed_strength=0.0, see return x -def decode_first_stage(model, x, full_quality=True): +def decode_first_stage(model, x): if not shared.opts.keep_incomplete and (shared.state.skipped or shared.state.interrupted): shared.log.debug(f'Decode VAE: skipped={shared.state.skipped} interrupted={shared.state.interrupted}') x_sample = torch.zeros((len(x), 3, x.shape[2] * 8, x.shape[3] * 8), dtype=devices.dtype_vae, device=devices.device) @@ -210,20 +210,14 @@ def decode_first_stage(model, x, full_quality=True): shared.state.job = 'VAE' with devices.autocast(disable = x.dtype==devices.dtype_vae): try: - if full_quality: - if hasattr(model, 'decode_first_stage'): - # x_sample = model.decode_first_stage(x) * 0.5 + 0.5 - x_sample = model.decode_first_stage(x) - elif hasattr(model, 'vae'): - x_sample = processing_vae.vae_decode(latents=x, model=model, output_type='np', full_quality=full_quality) - else: - x_sample = x - shared.log.error('Decode VAE unknown model') + if hasattr(model, 'decode_first_stage'): + # x_sample = model.decode_first_stage(x) * 0.5 + 0.5 + x_sample = model.decode_first_stage(x) + elif hasattr(model, 'vae'): + x_sample = processing_vae.vae_decode(latents=x, model=model, 
output_type='np') else: - from modules import sd_vae_taesd - x_sample = torch.zeros((len(x), 3, x.shape[2] * 8, x.shape[3] * 8), dtype=devices.dtype_vae, device=devices.device) - for i in range(len(x_sample)): - x_sample[i] = sd_vae_taesd.decode(x[i]) * 0.5 + 0.5 + x_sample = x + shared.log.error('Decode VAE unknown model') except Exception as e: x_sample = x shared.log.error(f'Decode VAE: {e}') @@ -407,7 +401,7 @@ def resize_init_images(p): def resize_hires(p, latents): # input=latents output=pil if not latent_upscaler else latent if not torch.is_tensor(latents): shared.log.warning('Hires: input is not tensor') - first_pass_images = processing_vae.vae_decode(latents=latents, model=shared.sd_model, full_quality=p.full_quality, output_type='pil', width=p.width, height=p.height) + first_pass_images = processing_vae.vae_decode(latents=latents, model=shared.sd_model, vae_type=p.vae_type, output_type='pil', width=p.width, height=p.height) return first_pass_images if (p.hr_upscale_to_x == 0 or p.hr_upscale_to_y == 0) and hasattr(p, 'init_hr'): @@ -418,7 +412,7 @@ def resize_hires(p, latents): # input=latents output=pil if not latent_upscaler resized_image = images.resize_image(p.hr_resize_mode, latents, p.hr_upscale_to_x, p.hr_upscale_to_y, upscaler_name=p.hr_upscaler, context=p.hr_resize_context) return resized_image - first_pass_images = processing_vae.vae_decode(latents=latents, model=shared.sd_model, full_quality=p.full_quality, output_type='pil', width=p.width, height=p.height) + first_pass_images = processing_vae.vae_decode(latents=latents, model=shared.sd_model, vae_type=p.vae_type, output_type='pil', width=p.width, height=p.height) resized_images = [] for img in first_pass_images: resized_image = images.resize_image(p.hr_resize_mode, img, p.hr_upscale_to_x, p.hr_upscale_to_y, upscaler_name=p.hr_upscaler, context=p.hr_resize_context) @@ -561,7 +555,7 @@ def save_intermediate(p, latents, suffix): for i in range(len(latents)): from modules.processing import 
create_infotext info=create_infotext(p, p.all_prompts, p.all_seeds, p.all_subseeds, [], iteration=p.iteration, position_in_batch=i) - decoded = processing_vae.vae_decode(latents=latents, model=shared.sd_model, output_type='pil', full_quality=p.full_quality, width=p.width, height=p.height) + decoded = processing_vae.vae_decode(latents=latents, model=shared.sd_model, output_type='pil', vae_type=p.vae_type, width=p.width, height=p.height) for j in range(len(decoded)): images.save_image(decoded[j], path=p.outpath_samples, basename="", seed=p.seeds[i], prompt=p.prompts[i], extension=shared.opts.samples_format, info=info, p=p, suffix=suffix) diff --git a/modules/processing_info.py b/modules/processing_info.py index 7d07a04dd..d3c49a4a1 100644 --- a/modules/processing_info.py +++ b/modules/processing_info.py @@ -58,7 +58,6 @@ def create_infotext(p: StableDiffusionProcessing, all_prompts=None, all_seeds=No "Batch": f'{p.n_iter}x{p.batch_size}' if p.n_iter > 1 or p.batch_size > 1 else None, "Model": None if (not shared.opts.add_model_name_to_info) or (not shared.sd_model.sd_checkpoint_info.model_name) else shared.sd_model.sd_checkpoint_info.model_name.replace(',', '').replace(':', ''), "Model hash": getattr(p, 'sd_model_hash', None if (not shared.opts.add_model_hash_to_info) or (not shared.sd_model.sd_model_hash) else shared.sd_model.sd_model_hash), - "VAE": (None if not shared.opts.add_model_name_to_info or sd_vae.loaded_vae_file is None else os.path.splitext(os.path.basename(sd_vae.loaded_vae_file))[0]) if p.full_quality else 'TAESD', "Refiner prompt": p.refiner_prompt if len(p.refiner_prompt) > 0 else None, "Refiner negative": p.refiner_negative if len(p.refiner_negative) > 0 else None, "Styles": "; ".join(p.styles) if p.styles is not None and len(p.styles) > 0 else None, @@ -71,6 +70,10 @@ def create_infotext(p: StableDiffusionProcessing, all_prompts=None, all_seeds=No "Comment": comment, "Operations": '; '.join(ops).replace('"', '') if len(p.ops) > 0 else 'none', } + 
if p.vae_type == 'Full': + args["VAE"] = (None if not shared.opts.add_model_name_to_info or sd_vae.loaded_vae_file is None else os.path.splitext(os.path.basename(sd_vae.loaded_vae_file))[0]) + elif p.vae_type == 'Tiny': + args["VAE"] = 'TAESD' if shared.opts.add_model_name_to_info and getattr(shared.sd_model, 'sd_checkpoint_info', None) is not None: args["Model"] = shared.sd_model.sd_checkpoint_info.model_name.replace(',', '').replace(':', '') if shared.opts.add_model_hash_to_info and getattr(shared.sd_model, 'sd_model_hash', None) is not None: diff --git a/modules/processing_original.py b/modules/processing_original.py index 261fc8c13..efb8c4d2e 100644 --- a/modules/processing_original.py +++ b/modules/processing_original.py @@ -49,7 +49,7 @@ def process_original(p: processing.StableDiffusionProcessing): c = get_conds_with_caching(prompt_parser.get_multicond_learned_conditioning, p.prompts, p.steps * step_multiplier, cached_c) with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(): samples_ddim = p.sample(conditioning=c, unconditional_conditioning=uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts) - x_samples_ddim = [processing.decode_first_stage(p.sd_model, samples_ddim[i:i+1].to(dtype=devices.dtype_vae), p.full_quality)[0].cpu() for i in range(samples_ddim.size(0))] + x_samples_ddim = [processing.decode_first_stage(p.sd_model, samples_ddim[i:i+1].to(dtype=devices.dtype_vae))[0].cpu() for i in range(samples_ddim.size(0))] try: for x in x_samples_ddim: devices.test_for_nans(x, "vae") @@ -60,7 +60,7 @@ def process_original(p: processing.StableDiffusionProcessing): devices.dtype_vae = torch.bfloat16 vae_file, vae_source = sd_vae.resolve_vae(p.sd_model.sd_model_checkpoint) sd_vae.load_vae(p.sd_model, vae_file, vae_source) - x_samples_ddim = [processing.decode_first_stage(p.sd_model, samples_ddim[i:i+1].to(dtype=devices.dtype_vae), p.full_quality)[0].cpu() for i in range(samples_ddim.size(0))] 
+ x_samples_ddim = [processing.decode_first_stage(p.sd_model, samples_ddim[i:i+1].to(dtype=devices.dtype_vae))[0].cpu() for i in range(samples_ddim.size(0))] for x in x_samples_ddim: devices.test_for_nans(x, "vae") else: @@ -90,7 +90,7 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning, target_height = p.hr_upscale_to_y decoded_samples = None if shared.opts.samples_save and shared.opts.save_images_before_highres_fix and not p.do_not_save_samples: - decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality) + decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae)) decoded_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0) for i, x_sample in enumerate(decoded_samples): x_sample = validate_sample(x_sample) @@ -107,13 +107,13 @@ def sample_txt2img(p: processing.StableDiffusionProcessingTxt2Img, conditioning, shared.state.job = 'Upscale' samples = images.resize_image(1, samples, target_width, target_height, upscaler_name=p.hr_upscaler) if getattr(p, "inpainting_mask_weight", shared.opts.inpainting_mask_weight) < 1.0: - image_conditioning = img2img_image_conditioning(p, decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality), samples) + image_conditioning = img2img_image_conditioning(p, decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae)), samples) else: image_conditioning = txt2img_image_conditioning(p, samples.to(dtype=devices.dtype_vae)) else: shared.state.job = 'Upscale' if decoded_samples is None: - decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae), p.full_quality) + decoded_samples = decode_first_stage(p.sd_model, samples.to(dtype=devices.dtype_vae)) decoded_samples = torch.clamp((decoded_samples + 1.0) / 2.0, min=0.0, max=1.0) batch_images = [] for _i, x_sample in enumerate(decoded_samples): diff --git a/modules/processing_vae.py b/modules/processing_vae.py index 
b54d0e245..e791f11c6 100644 --- a/modules/processing_vae.py +++ b/modules/processing_vae.py @@ -17,9 +17,9 @@ def create_latents(image, p, dtype=None, device=None): if image is None: return image elif isinstance(image, Image.Image): - latents = vae_encode(image, model=shared.sd_model, full_quality=p.full_quality) + latents = vae_encode(image, model=shared.sd_model, vae_type=p.vae_type) elif isinstance(image, list): - latents = [vae_encode(i, model=shared.sd_model, full_quality=p.full_quality).squeeze(dim=0) for i in image] + latents = [vae_encode(i, model=shared.sd_model, vae_type=p.vae_type).squeeze(dim=0) for i in image] latents = torch.stack(latents, dim=0).to(shared.device) else: shared.log.warning(f'Latents: input type: {type(image)} {image}') @@ -230,7 +230,7 @@ def taesd_vae_encode(image): return encoded -def vae_decode(latents, model, output_type='np', full_quality=True, width=None, height=None, frames=None): +def vae_decode(latents, model, output_type='np', vae_type='Full', width=None, height=None, frames=None): t0 = time.time() model = model or shared.sd_model if not hasattr(model, 'vae') and hasattr(model, 'pipe'): @@ -238,6 +238,15 @@ def vae_decode(latents, model, output_type='np', full_quality=True, width=None, if latents is None or not torch.is_tensor(latents): # already decoded return latents prev_job = shared.state.job + + if vae_type == 'Remote': + shared.state.job = 'Remote VAE' + from modules.sd_vae_remote import remote_decode + images = remote_decode(latents=latents, width=width, height=height) + shared.state.job = prev_job + if images is not None and len(images) > 0: + return images + shared.state.job = 'VAE' if latents.shape[0] == 0: shared.log.error(f'VAE nothing to decode: {latents.shape}') @@ -261,7 +270,7 @@ def vae_decode(latents, model, output_type='np', full_quality=True, width=None, if latents.shape[-1] <= 4: # not a latent, likely an image decoded = latents.float().cpu().numpy() - elif full_quality and hasattr(model, "vae"): + elif 
vae_type == 'Full' and hasattr(model, "vae"): decoded = full_vae_decode(latents=latents, model=model) elif hasattr(model, "vqgan"): decoded = full_vqgan_decode(latents=latents, model=model) @@ -296,7 +305,7 @@ def vae_decode(latents, model, output_type='np', full_quality=True, width=None, return imgs -def vae_encode(image, model, full_quality=True): # pylint: disable=unused-variable +def vae_encode(image, model, vae_type='Full'): # pylint: disable=unused-variable if shared.state.interrupted or shared.state.skipped: return [] if not hasattr(model, 'vae') and hasattr(model, 'pipe'): @@ -305,7 +314,7 @@ def vae_encode(image, model, full_quality=True): # pylint: disable=unused-variab shared.log.error('VAE not found in model') return [] tensor = TF.to_tensor(image.convert("RGB")).unsqueeze(0).to(devices.device, devices.dtype_vae) - if full_quality: + if vae_type == 'Full': tensor = tensor * 2 - 1 latents = full_vae_encode(image=tensor, model=shared.sd_model) else: @@ -321,7 +330,7 @@ def reprocess(gallery): if latent is None or gallery is None: return None shared.log.info(f'Reprocessing: latent={latent.shape}') - reprocessed = vae_decode(latent, shared.sd_model, output_type='pil', full_quality=True) + reprocessed = vae_decode(latent, shared.sd_model, output_type='pil') outputs = [] for i0, i1 in zip(gallery, reprocessed): if isinstance(i1, np.ndarray): diff --git a/modules/sd_vae_remote.py b/modules/sd_vae_remote.py new file mode 100644 index 000000000..5306895ea --- /dev/null +++ b/modules/sd_vae_remote.py @@ -0,0 +1,50 @@ +import io +import time +import base64 +import torch +import requests +from PIL import Image +from safetensors.torch import _tobytes + + +hf_endpoints = { + 'sd': 'https://lqmfdhmzmy4dw51z.us-east-1.aws.endpoints.huggingface.cloud', + 'sdxl': 'https://m5fxqwyk0r3uu79o.us-east-1.aws.endpoints.huggingface.cloud', + 'f1': 'https://zy1z7fzxpgtltg06.us-east-1.aws.endpoints.huggingface.cloud', +} + + +def remote_decode(latents: torch.Tensor, width: int = 
0, height: int = 0, model_type: str = None) -> Image.Image: + from modules import devices, shared, errors + images = [] + model_type = model_type or shared.sd_model_type + url = hf_endpoints.get(model_type, None) + if url is None: + shared.log.error(f'Decode: type="remote" type={model_type} unsuppported') + return images + t0 = time.time() + latents = latents.unsqueeze(0) if len(latents.shape) == 3 else latents + for i in range(latents.shape[0]): + try: + latent = latents[i].detach().clone().to(device=devices.cpu, dtype=devices.dtype).unsqueeze(0) + encoded = base64.b64encode(_tobytes(latent, "inputs")).decode("utf-8") + params = {"shape": list(latent.shape), "dtype": str(latent.dtype).split(".", maxsplit=1)[-1]} + if (model_type == 'f1') and (width > 0) and (height > 0): + params['width'] = width + params['height'] = height + response = requests.post( + url=url, + json={"inputs": encoded, "parameters": params}, + headers={"Content-Type": "application/json", "Accept": "image/jpeg"}, + timeout=60, + ) + if not response.ok: + shared.log.error(f'Decode: type="remote" model={model_type} code={response.status_code} {response.json()}') + else: + images.append(Image.open(io.BytesIO(response.content))) + except Exception as e: + shared.log.error(f'Decode: type="remote" model={model_type} {e}') + errors.display(e, 'VAE') + t1 = time.time() + shared.log.debug(f'Decode: type="remote" model={model_type} args={params} images={images} time={t1-t0:.3f}s') + return images diff --git a/modules/txt2img.py b/modules/txt2img.py index e2cf7af77..08184fe43 100644 --- a/modules/txt2img.py +++ b/modules/txt2img.py @@ -11,7 +11,7 @@ debug('Trace: PROCESS') def txt2img(id_task, state, prompt, negative_prompt, prompt_styles, steps, sampler_index, hr_sampler_index, - full_quality, tiling, hidiffusion, + vae_type, tiling, hidiffusion, detailer_enabled, detailer_prompt, detailer_negative, detailer_steps, detailer_strength, n_iter, batch_size, cfg_scale, image_cfg_scale, 
diffusers_guidance_rescale, pag_scale, pag_adaptive, cfg_end, @@ -64,7 +64,7 @@ def txt2img(id_task, state, clip_skip=clip_skip, width=width, height=height, - full_quality=full_quality, + vae_type=vae_type, detailer_enabled=detailer_enabled, detailer_prompt=detailer_prompt, detailer_negative=detailer_negative, diff --git a/modules/ui_control.py b/modules/ui_control.py index e4c0b4413..dd09ff663 100644 --- a/modules/ui_control.py +++ b/modules/ui_control.py @@ -161,7 +161,7 @@ def create_ui(_blocks: gr.Blocks=None): mask_controls = masking.create_segment_ui() - full_quality, tiling, hidiffusion, cfg_scale, clip_skip, image_cfg_scale, guidance_rescale, pag_scale, pag_adaptive, cfg_end = ui_sections.create_advanced_inputs('control') + vae_type, tiling, hidiffusion, cfg_scale, clip_skip, image_cfg_scale, guidance_rescale, pag_scale, pag_adaptive, cfg_end = ui_sections.create_advanced_inputs('control') hdr_mode, hdr_brightness, hdr_color, hdr_sharpen, hdr_clamp, hdr_boundary, hdr_threshold, hdr_maximize, hdr_max_center, hdr_max_boundry, hdr_color_picker, hdr_tint_ratio = ui_sections.create_correction_inputs('control') with gr.Accordion(open=False, label="Video", elem_id="control_video", elem_classes=["small-accordion"]): @@ -561,7 +561,7 @@ def create_ui(_blocks: gr.Blocks=None): prompt, negative, styles, steps, sampler_index, seed, subseed, subseed_strength, seed_resize_from_h, seed_resize_from_w, - cfg_scale, clip_skip, image_cfg_scale, guidance_rescale, pag_scale, pag_adaptive, cfg_end, full_quality, tiling, hidiffusion, + cfg_scale, clip_skip, image_cfg_scale, guidance_rescale, pag_scale, pag_adaptive, cfg_end, vae_type, tiling, hidiffusion, detailer_enabled, detailer_prompt, detailer_negative, detailer_steps, detailer_strength, hdr_mode, hdr_brightness, hdr_color, hdr_sharpen, hdr_clamp, hdr_boundary, hdr_threshold, hdr_maximize, hdr_max_center, hdr_max_boundry, hdr_color_picker, hdr_tint_ratio, resize_mode_before, resize_name_before, resize_context_before, 
width_before, height_before, scale_by_before, selected_scale_tab_before, @@ -646,7 +646,7 @@ def create_ui(_blocks: gr.Blocks=None): (image_cfg_scale, "Image CFG scale"), (image_cfg_scale, "Hires CFG scale"), (guidance_rescale, "CFG rescale"), - (full_quality, "Full quality"), + (vae_type, "VAE type"), (tiling, "Tiling"), (hidiffusion, "HiDiffusion"), # detailer diff --git a/modules/ui_img2img.py b/modules/ui_img2img.py index c8e45648d..e6de84f6e 100644 --- a/modules/ui_img2img.py +++ b/modules/ui_img2img.py @@ -131,7 +131,7 @@ def create_ui(): denoising_strength = gr.Slider(minimum=0.0, maximum=0.99, step=0.01, label='Denoising strength', value=0.30, elem_id="img2img_denoising_strength") refiner_start = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, label='Denoise start', value=0.0, elem_id="img2img_refiner_start") - full_quality, tiling, hidiffusion, cfg_scale, clip_skip, image_cfg_scale, diffusers_guidance_rescale, pag_scale, pag_adaptive, cfg_end = ui_sections.create_advanced_inputs('img2img') + vae_type, tiling, hidiffusion, cfg_scale, clip_skip, image_cfg_scale, diffusers_guidance_rescale, pag_scale, pag_adaptive, cfg_end = ui_sections.create_advanced_inputs('img2img') hdr_mode, hdr_brightness, hdr_color, hdr_sharpen, hdr_clamp, hdr_boundary, hdr_threshold, hdr_maximize, hdr_max_center, hdr_max_boundry, hdr_color_picker, hdr_tint_ratio = ui_sections.create_correction_inputs('img2img') enable_hr, hr_sampler_index, hr_denoising_strength, hr_resize_mode, hr_resize_context, hr_upscaler, hr_force, hr_second_pass_steps, hr_scale, hr_resize_x, hr_resize_y, refiner_steps, hr_refiner_start, refiner_prompt, refiner_negative = ui_sections.create_hires_inputs('txt2img') detailer_enabled, detailer_prompt, detailer_negative, detailer_steps, detailer_strength = shared.yolo.ui('img2img') @@ -175,7 +175,7 @@ def create_ui(): sampler_index, mask_blur, mask_alpha, inpainting_fill, - full_quality, tiling, hidiffusion, + vae_type, tiling, hidiffusion, detailer_enabled, 
detailer_prompt, detailer_negative, detailer_steps, detailer_strength, batch_count, batch_size, cfg_scale, image_cfg_scale, @@ -261,7 +261,7 @@ def create_ui(): (image_cfg_scale, "Hires CFG scale"), (clip_skip, "Clip skip"), (diffusers_guidance_rescale, "CFG rescale"), - (full_quality, "Full quality"), + (vae_type, "VAE type"), (tiling, "Tiling"), (hidiffusion, "HiDiffusion"), # detailer diff --git a/modules/ui_sections.py b/modules/ui_sections.py index d83a8a60b..815c17cb4 100644 --- a/modules/ui_sections.py +++ b/modules/ui_sections.py @@ -170,8 +170,9 @@ def create_advanced_inputs(tab, base=True): with gr.Accordion(open=False, label="Advanced", elem_id=f"{tab}_advanced", elem_classes=["small-accordion"]): with gr.Group(): with gr.Row(elem_id=f"{tab}_advanced_options"): - full_quality = gr.Checkbox(label='Full quality', value=True, elem_id=f"{tab}_full_quality") - tiling = gr.Checkbox(label='Tiling', value=False, elem_id=f"{tab}_tiling") + vae_type = gr.Dropdown(label='VAE type', choices=['Full', 'Tiny', 'Remote'], value='Full', elem_id=f"{tab}_vae_type") + with gr.Row(elem_id=f"{tab}_advanced_options"): + tiling = gr.Checkbox(label='Texture tiling', value=False, elem_id=f"{tab}_tiling") hidiffusion = gr.Checkbox(label='HiDiffusion', value=False, elem_id=f"{tab}_hidiffusion") if base: cfg_scale, cfg_end = create_cfg_inputs(tab) @@ -185,7 +186,7 @@ def create_advanced_inputs(tab, base=True): diffusers_pag_adaptive = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, label='Adaptive scaling', value=0.5, elem_id=f"{tab}_pag_adaptive", visible=shared.native) with gr.Row(): clip_skip = gr.Slider(label='CLIP skip', value=1, minimum=0, maximum=12, step=0.1, elem_id=f"{tab}_clip_skip", interactive=True) - return full_quality, tiling, hidiffusion, cfg_scale, clip_skip, image_cfg_scale, diffusers_guidance_rescale, diffusers_pag_scale, diffusers_pag_adaptive, cfg_end + return vae_type, tiling, hidiffusion, cfg_scale, clip_skip, image_cfg_scale, diffusers_guidance_rescale, 
diffusers_pag_scale, diffusers_pag_adaptive, cfg_end def create_correction_inputs(tab): diff --git a/modules/ui_txt2img.py b/modules/ui_txt2img.py index 64d2906f8..30f9ab5ad 100644 --- a/modules/ui_txt2img.py +++ b/modules/ui_txt2img.py @@ -44,7 +44,7 @@ def create_ui(): with gr.Accordion(open=False, label="Samplers", elem_classes=["small-accordion"], elem_id="txt2img_sampler_group"): ui_sections.create_sampler_options('txt2img') seed, reuse_seed, subseed, reuse_subseed, subseed_strength, seed_resize_from_h, seed_resize_from_w = ui_sections.create_seed_inputs('txt2img') - full_quality, tiling, hidiffusion, _cfg_scale, clip_skip, image_cfg_scale, diffusers_guidance_rescale, pag_scale, pag_adaptive, _cfg_end = ui_sections.create_advanced_inputs('txt2img', base=False) + vae_type, tiling, hidiffusion, _cfg_scale, clip_skip, image_cfg_scale, diffusers_guidance_rescale, pag_scale, pag_adaptive, _cfg_end = ui_sections.create_advanced_inputs('txt2img', base=False) hdr_mode, hdr_brightness, hdr_color, hdr_sharpen, hdr_clamp, hdr_boundary, hdr_threshold, hdr_maximize, hdr_max_center, hdr_max_boundry, hdr_color_picker, hdr_tint_ratio = ui_sections.create_correction_inputs('txt2img') enable_hr, hr_sampler_index, denoising_strength, hr_resize_mode, hr_resize_context, hr_upscaler, hr_force, hr_second_pass_steps, hr_scale, hr_resize_x, hr_resize_y, refiner_steps, refiner_start, refiner_prompt, refiner_negative = ui_sections.create_hires_inputs('txt2img') detailer_enabled, detailer_prompt, detailer_negative, detailer_steps, detailer_strength = shared.yolo.ui('txt2img') @@ -64,7 +64,7 @@ def create_ui(): dummy_component, state, txt2img_prompt, txt2img_negative_prompt, txt2img_prompt_styles, steps, sampler_index, hr_sampler_index, - full_quality, tiling, hidiffusion, + vae_type, tiling, hidiffusion, detailer_enabled, detailer_prompt, detailer_negative, detailer_steps, detailer_strength, batch_count, batch_size, cfg_scale, image_cfg_scale, diffusers_guidance_rescale, pag_scale, 
pag_adaptive, cfg_end, @@ -122,7 +122,7 @@ def create_ui(): (image_cfg_scale, "Image CFG scale"), (image_cfg_scale, "Hires CFG scale"), (diffusers_guidance_rescale, "CFG rescale"), - (full_quality, "Full quality"), + (vae_type, "VAE type"), (tiling, "Tiling"), (hidiffusion, "HiDiffusion"), # detailer diff --git a/wiki b/wiki index e45bfe41f..0ef2c8d1d 160000 --- a/wiki +++ b/wiki @@ -1 +1 @@ -Subproject commit e45bfe41f0e494d6d5145443f966ba47560702f5 +Subproject commit 0ef2c8d1d85ea6fb433b7ff8f8e22d295de082c0