import math import torch from tqdm import tqdm from modules import devices, shared, prompt_parser from modules.shared import state from modules.processing import opt_f, StableDiffusionProcessing from tile_utils.typing import * from tile_utils.utils import * class TiledDiffusion: def __init__(self, p:StableDiffusionProcessing, sampler:Sampler): self.method = self.__class__.__name__ self.p = p self.pbar = None # sampler self.sampler_name = p.sampler_name self.sampler_raw = sampler if self.is_kdiff: self.sampler: CFGDenoiser = sampler.model_wrap_cfg else: self.sampler: VanillaStableDiffusionSampler = sampler # fix. Kdiff 'AND' support and image editing model support if self.is_kdiff and not hasattr(self, 'is_edit_model'): self.is_edit_model = (shared.sd_model.cond_stage_key == "edit" # "txt" and self.sampler.image_cfg_scale is not None and self.sampler.image_cfg_scale != 1.0) # cache. final result of current sampling step, [B, C=4, H//8, W//8] # avoiding overhead of creating new tensors and weight summing self.x_buffer: Tensor = None # FIXME: I'm trying to count the step correctly but it's not working self.step_count = 0 self.inner_loop_count = 0 self.kdiff_step = -1 # weights for background & grid bboxes self.w: int = int(self.p.width // opt_f) self.h: int = int(self.p.height // opt_f) self.weights: Tensor = torch.zeros((1, 1, self.h, self.w), device=devices.device, dtype=torch.float32) # ext. Grid tiling painting (grid bbox) self.enable_grid_bbox: bool = False self.tile_w: int = None self.tile_h: int = None self.num_batches: int = None self.batched_bboxes: List[List[BBox]] = [] # ext. Region Prompt Control (custom bbox) self.enable_custom_bbox: bool = False self.custom_bboxes: List[CustomBBox] = [] self.cond_basis: Cond = None self.uncond_basis: Uncond = None self.draw_background: bool = True # by default we draw major prompts in grid tiles self.causal_layers: bool = None # ext. ControlNet self.enable_controlnet: bool = False self.controlnet_script: Any = None self.control_tensor_batch: Any = None self.control_params: Any = None self.control_tensor_cpu: bool = None self.control_tensor_custom: List = [] @property def is_kdiff(self): return isinstance(self.sampler_raw, KDiffusionSampler) @property def is_ddim(self): return isinstance(self.sampler_raw, VanillaStableDiffusionSampler) def update_pbar(self): if self.pbar.n >= self.pbar.total: self.pbar.close() else: if self.step_count == state.sampling_step: self.inner_loop_count += 1 if self.inner_loop_count < self.total_bboxes: self.pbar.update() else: self.step_count = state.sampling_step self.inner_loop_count = 0 def reset_buffer(self, x_in:Tensor): if self.x_buffer is None: self.x_buffer = torch.zeros_like(x_in, device=x_in.device, dtype=x_in.dtype) else: self.x_buffer.zero_() def init_done(self): ''' Call this after all `init_*`, settings are done, now perform: - settings sanity check - pre-computations, cache init - anything thing needed before denoising starts ''' self.total_bboxes = (self.num_batches if self.draw_background else 0) + len(self.custom_bboxes) assert self.total_bboxes > 0, "Nothing to paint! No background to draw and no custom bboxes were provided." self.pbar = tqdm(total=(self.total_bboxes) * state.sampling_steps, desc=f"{self.method} Sampling: ") ''' ↓↓↓ extensive functionality ↓↓↓ ''' @grid_bbox def init_grid_bbox(self, tile_w:int, tile_h:int, overlap:int, tile_bs:int): self.enable_grid_bbox = True self.tile_w = min(tile_w, self.w) self.tile_h = min(tile_h, self.h) overlap = max(0, min(overlap, min(tile_w, tile_h) - 4)) # split the latent into overlapped tiles, then batching # weights basically indicate how many times a pixel is painted bboxes, weights = split_bboxes(self.w, self.h, self.tile_w, self.tile_h, overlap, self.get_tile_weights()) self.weights += weights self.num_batches = math.ceil(len(bboxes) / tile_bs) BS = math.ceil(len(bboxes) / self.num_batches) # optimal_batch_size self.batched_bboxes = [bboxes[i*BS:(i+1)*BS] for i in range(self.num_batches)] @grid_bbox def get_tile_weights(self) -> Union[Tensor, float]: return 1.0 @custom_bbox def init_custom_bbox(self, bbox_control_states:BBoxControls, draw_background:bool, causal_layers:bool): self.enable_custom_bbox = True self.causal_layers = causal_layers self.draw_background = draw_background if not draw_background and self.weights is not None: self.weights.zero_() # The number parameters needed to initialize the CustomBBox is 9 currently. # Need to be the same as the number of parameters in the `bbox_control_states` list. n_controls = 9 self.custom_bboxes: List[CustomBBox] = [] for i in range(0, len(bbox_control_states), n_controls): e, x, y, w, h, p, n, blend_mode, feather_ratio = bbox_control_states[i:i+n_controls] if not e or x > 1.0 or y > 1.0 or w <= 0.0 or h <= 0.0: continue x = int(x * self.w) y = int(y * self.h) w = math.ceil(w * self.w) h = math.ceil(h * self.h) x = max(0, x) y = max(0, y) w = min(self.w - x, w) h = min(self.h - y, h) self.custom_bboxes.append(CustomBBox(x, y, w, h, p, n, blend_mode, feather_ratio)) if len(self.custom_bboxes) == 0: return # prepare cond p = self.p prompts = p.all_prompts[:p.batch_size] neg_prompts = p.all_negative_prompts[:p.batch_size] for bbox in self.custom_bboxes: bbox.cond, bbox.extra_network_data = Condition.get_cond(Prompt.append_prompt(prompts, bbox.prompt), p.steps, p.styles) bbox.uncond = Condition.get_uncond(Prompt.append_prompt(neg_prompts, bbox.neg_prompt), p.steps, p.styles) self.cond_basis, _ = Condition.get_cond(prompts, p.steps) self.uncond_basis = Condition.get_uncond(neg_prompts, p.steps) @custom_bbox def reconstruct_custom_cond(self, org_cond, custom_cond, custom_uncond, bbox): image_conditioning = None if isinstance(org_cond, dict): image_cond = org_cond['c_concat'][0] if image_cond.shape[2] == self.h and image_cond.shape[3] == self.w: image_cond = image_cond[:, :, bbox[1]:bbox[3], bbox[0]:bbox[2]] image_conditioning = image_cond conds_list, tensor = prompt_parser.reconstruct_multicond_batch(custom_cond, self.sampler.step) custom_uncond = prompt_parser.reconstruct_cond_batch(custom_uncond, self.sampler.step) return conds_list, tensor, custom_uncond, image_conditioning @custom_bbox def kdiff_custom_forward(self, x_tile:Tensor, sigma_in:Tensor, original_cond:CondDict, bbox_id:int, bbox:CustomBBox, forward_func, ): ''' draw custom bbox ''' ''' # The inner kdiff noise prediction is usually batched. # We need to unwrap the inside loop to simulate the batched behavior. # This can be extremely tricky. ''' # x_tile: [1, 4, 13, 15] # original_cond: {'c_crossattn': Tensor[1, 77, 768], 'c_concat': Tensor[1, 5, 1, 1]} # custom_cond: MulticondLearnedConditioning # uncond: Tensor[1, 231, 768] # bbox: CustomBBox # sigma_in: Tensor[1] # forward_func: CFGDenoiser.forward if self.kdiff_step != self.sampler.step: self.kdiff_step = self.sampler.step self.kdiff_step_bbox = [-1 for _ in range(len(self.custom_bboxes))] self.tensor = {} # {int: Tensor[cond]} self.uncond = {} # {int: Tensor[cond]} self.image_cond_in = {} # Initialize global prompts just for estimate the behavior of kdiff self.real_tensor = Condition.reconstruct_cond(self.cond_basis, self.sampler.step) self.real_uncond = Condition.reconstruct_uncond(self.uncond_basis, self.sampler.step) # reset the progress for all bboxes self.a = [0 for _ in range(len(self.custom_bboxes))] if self.kdiff_step_bbox[bbox_id] != self.sampler.step: # When a new step starts for a bbox, we need to judge whether the tensor is batched. self.kdiff_step_bbox[bbox_id] = self.sampler.step _, tensor, uncond, image_cond_in = self.reconstruct_custom_cond(original_cond, bbox.cond, bbox.uncond, bbox) if self.real_tensor.shape[1] == self.real_uncond.shape[1]: if shared.batch_cond_uncond: # when the real tensor is with equal length, all information is contained in x_tile. # we simulate the batched behavior and compute all the tensors in one go. if tensor.shape[1] == uncond.shape[1]: # When our prompt tensor is with equal length, we can directly their code. if not self.is_edit_model: cond = torch.cat([tensor, uncond]) else: cond = torch.cat([tensor, uncond, uncond]) self.set_controlnet_tensors(bbox_id, x_tile.shape[0]) return forward_func(x_tile, sigma_in, cond={"c_crossattn": [cond], "c_concat": [image_cond_in]}) else: # When not, we need to pass the tensor to UNet separately. x_out = torch.zeros_like(x_tile) cond_size = tensor.shape[0] self.set_controlnet_tensors(bbox_id, cond_size) cond_out = forward_func( x_tile [:cond_size], sigma_in[:cond_size], cond={ "c_crossattn": [tensor], "c_concat": [image_cond_in[:cond_size]] }) uncond_size = uncond.shape[0] self.set_controlnet_tensors(bbox_id, uncond_size) uncond_out = forward_func( x_tile [cond_size:cond_size+uncond_size], sigma_in[cond_size:cond_size+uncond_size], cond={ "c_crossattn": [uncond], "c_concat": [image_cond_in[cond_size:cond_size+uncond_size]] }) x_out[:cond_size] = cond_out x_out[cond_size:cond_size+uncond_size] = uncond_out if self.is_edit_model: x_out[cond_size+uncond_size:] = uncond_out return x_out # otherwise, the x_tile is only a partial batch. # We have to denoise in different runs. # We store the prompt and neg_prompt tensors for current bbox self.tensor[bbox_id] = tensor self.uncond[bbox_id] = uncond self.image_cond_in[bbox_id] = image_cond_in # Now we get current batch of prompt and neg_prompt tensors tensor = self.tensor[bbox_id] uncond = self.uncond[bbox_id] batch_size = x_tile.shape[0] # get the start and end index of the current batch a = self.a[bbox_id] b = a + batch_size self.a[bbox_id] += batch_size if self.real_tensor.shape[1] == self.real_uncond.shape[1]: # When use --lowvram or --medvram, kdiff will slice the cond and uncond with [a:b] # So we need to slice our tensor and uncond with the same index as original kdiff. # --- original code in kdiff --- # if not self.is_edit_model: # cond = torch.cat([tensor, uncond]) # else: # cond = torch.cat([tensor, uncond, uncond]) # cond = cond[a:b] # ------------------------------ # The original kdiff code is to concat and then slice, but this cannot apply to # our custom prompt tensor when tensor.shape[1] != uncond.shape[1]. So we adapt it. cond_in, uncond_in = None, None # Slice the [prompt, neg prompt, (possibly) neg prompt] with [a:b] if not self.is_edit_model: if b <= tensor.shape[0]: cond_in = tensor[a:b] elif a >= tensor.shape[0]: cond_in = uncond[a-tensor.shape[0]:b-tensor.shape[0]] else: cond_in = tensor[a:] uncond_in = uncond[:b-tensor.shape[0]] else: if b <= tensor.shape[0]: cond_in = tensor[a:b] elif b > tensor.shape[0] and b <= tensor.shape[0] + uncond.shape[0]: if a>= tensor.shape[0]: cond_in = uncond[a-tensor.shape[0]:b-tensor.shape[0]] else: cond_in = tensor[a:] uncond_in = uncond[:b-tensor.shape[0]] else: if a >= tensor.shape[0] + uncond.shape[0]: cond_in = uncond[a-tensor.shape[0]-uncond.shape[0]:b-tensor.shape[0]-uncond.shape[0]] elif a >= tensor.shape[0]: cond_in = torch.cat([uncond[a-tensor.shape[0]:], uncond[:b-tensor.shape[0]-uncond.shape[0]]]) if uncond_in is None or tensor.shape[1] == uncond.shape[1]: # If the tensor can be passed to UNet in one go, do it. if uncond_in is not None: cond_in = torch.cat([cond_in, uncond_in]) self.set_controlnet_tensors(bbox_id, x_tile.shape[0]) return forward_func(x_tile, sigma_in, cond={ "c_crossattn": [cond_in], "c_concat": [self.image_cond_in[bbox_id]] }) else: # If not, we need to pass the tensor to UNet separately. x_out = torch.zeros_like(x_tile) cond_size = cond_in.shape[0] self.set_controlnet_tensors(bbox_id, cond_size) cond_out = forward_func( x_tile [:cond_size], sigma_in[:cond_size], cond={ "c_crossattn": [cond_in], "c_concat": [self.image_cond_in[bbox_id]] }) self.set_controlnet_tensors(bbox_id, uncond_in.shape[0]) uncond_out = forward_func( x_tile [cond_size:], sigma_in[cond_size:], cond={ "c_crossattn": [uncond_in], "c_concat": [self.image_cond_in[bbox_id]] }) x_out[:cond_size] = cond_out x_out[cond_size:] = uncond_out return x_out # If the original prompt is with different length, # kdiff will deal with the cond and uncond separately. # Hence we also deal with the tensor and uncond separately. # get the start and end index of the current batch if a < tensor.shape[0]: # Deal with custom prompt tensor if not self.is_edit_model: c_crossattn = [tensor[a:b]] else: c_crossattn = torch.cat([tensor[a:b]], uncond) self.set_controlnet_tensors(bbox_id, x_tile.shape[0]) # complete this batch. return forward_func( x_tile, sigma_in, cond={ "c_crossattn": c_crossattn, "c_concat": [self.image_cond_in[bbox_id]] }) else: # if the cond is finished, we need to process the uncond. self.set_controlnet_tensors(bbox_id, uncond.shape[0]) return forward_func( x_tile, sigma_in, cond={ "c_crossattn": [uncond], "c_concat": [self.image_cond_in[bbox_id]] }) @custom_bbox def ddim_custom_forward(self, x:Tensor, cond_in:CondDict, bbox:CustomBBox, ts, forward_func, *args, **kwargs ): ''' draw custom bbox ''' conds_list, tensor, uncond, image_conditioning = self.reconstruct_custom_cond(cond_in, bbox.cond, bbox.uncond, bbox) assert all([len(conds) == 1 for conds in conds_list]), \ 'composition via AND is not supported for DDIM/PLMS samplers' cond = tensor # for DDIM, shapes definitely match. So we dont need to do the same thing as in the KDIFF sampler. if uncond.shape[1] < cond.shape[1]: last_vector = uncond[:, -1:] last_vector_repeated = last_vector.repeat([1, cond.shape[1] - uncond.shape[1], 1]) uncond = torch.hstack([uncond, last_vector_repeated]) elif uncond.shape[1] > cond.shape[1]: uncond = uncond[:, :cond.shape[1]] # Wrap the image conditioning back up since the DDIM code can accept the dict directly. # Note that they need to be lists because it just concatenates them later. if image_conditioning is not None: cond = {"c_concat": [image_conditioning], "c_crossattn": [cond]} uncond = {"c_concat": [image_conditioning], "c_crossattn": [uncond]} # We cannot determine the batch size here for different methods, so delay it to the forward_func. return forward_func(x, cond, ts, unconditional_conditioning=uncond, *args, **kwargs) @controlnet def init_controlnet(self, controlnet_script, control_tensor_cpu): self.enable_controlnet = True self.controlnet_script = controlnet_script self.control_tensor_cpu = control_tensor_cpu self.control_tensor_batch = None self.control_params = None self.control_tensor_custom = [] self.reset_controlnet_tensors() self.prepare_controlnet_tensors() @controlnet def reset_controlnet_tensors(self): if self.control_tensor_batch is None: return for param_id in range(len(self.control_params)): self.control_params[param_id].hint_cond = self.org_control_tensor_batch[param_id] @controlnet def prepare_controlnet_tensors(self): ''' Crop the control tensor into tiles and cache them ''' if self.control_tensor_batch is not None: return if self.controlnet_script is None or self.control_params is not None: return latest_network = self.controlnet_script.latest_network if latest_network is None or not hasattr(latest_network, 'control_params'): return self.control_params = latest_network.control_params tensors = [param.hint_cond for param in latest_network.control_params] self.org_control_tensor_batch = tensors if len(tensors) == 0: return self.control_tensor_batch = [] for i in range(len(tensors)): control_tile_list = [] control_tensor = tensors[i] for bboxes in self.batched_bboxes: single_batch_tensors = [] for bbox in bboxes: if len(control_tensor.shape) == 3: control_tensor.unsqueeze_(0) control_tile = control_tensor[:, :, bbox[1]*opt_f:bbox[3]*opt_f, bbox[0]*opt_f:bbox[2]*opt_f] single_batch_tensors.append(control_tile) control_tile = torch.cat(single_batch_tensors, dim=0) if self.control_tensor_cpu: control_tile = control_tile.cpu() control_tile_list.append(control_tile) self.control_tensor_batch.append(control_tile_list) if len(self.custom_bboxes) > 0: custom_control_tile_list = [] for bbox in self.custom_bboxes: if len(control_tensor.shape) == 3: control_tensor.unsqueeze_(0) control_tile = control_tensor[:, :, bbox[1]*opt_f:bbox[3]*opt_f, bbox[0]*opt_f:bbox[2]*opt_f] if self.control_tensor_cpu: control_tile = control_tile.cpu() custom_control_tile_list.append(control_tile) self.control_tensor_custom.append(custom_control_tile_list) @controlnet def switch_controlnet_tensors(self, batch_id:int, x_batch_size:int, tile_batch_size:int, is_denoise=False): if self.control_tensor_batch is None: return for param_id in range(len(self.control_params)): control_tile = self.control_tensor_batch[param_id][batch_id] if self.is_kdiff: all_control_tile = [] for i in range(tile_batch_size): this_control_tile = [control_tile[i].unsqueeze(0)] * x_batch_size all_control_tile.append(torch.cat(this_control_tile, dim=0)) control_tile = torch.cat(all_control_tile, dim=0) else: control_tile = control_tile.repeat([x_batch_size if is_denoise else x_batch_size * 2, 1, 1, 1]) self.control_params[param_id].hint_cond = control_tile.to(devices.device) @controlnet def set_controlnet_tensors(self, bbox_id:int, repeat_size:int): if not len(self.control_tensor_custom): return for param_id in range(len(self.control_params)): control_tensor = self.control_tensor_custom[param_id][bbox_id].to(devices.device) self.control_params[param_id].hint_cond = control_tensor.repeat((repeat_size, 1, 1, 1))