diff --git a/patches/external_pr/hypernetwork.py b/patches/external_pr/hypernetwork.py
index f4f1122..1cc4e9e 100644
--- a/patches/external_pr/hypernetwork.py
+++ b/patches/external_pr/hypernetwork.py
@@ -16,6 +16,7 @@ import tqdm
 from modules import shared, sd_models, devices, processing, sd_samplers
 from modules.hypernetworks.hypernetwork import optimizer_dict, stack_conds, save_hypernetwork, report_statistics
 from modules.textual_inversion.learn_schedule import LearnRateScheduler
+from modules.textual_inversion.textual_inversion import tensorboard_setup, tensorboard_add, tensorboard_add_image
 from .textual_inversion import validate_train_inputs, write_loss
 from ..hypernetwork import Hypernetwork, load_hypernetwork
 from . import sd_hijack_checkpoint
@@ -24,7 +25,7 @@ from ..scheduler import CosineAnnealingWarmUpRestarts
 from .dataset import PersonalizedBase,PersonalizedDataLoader


-def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step, data_root, log_directory,
+def train_hypernetwork(id_task, hypernetwork_name, learn_rate, batch_size, gradient_step, data_root, log_directory,
                        training_width, training_height, steps, shuffle_tags, tag_drop_out, latent_sampling_method,
                        create_image_every, save_hypernetwork_every, template_file, preview_from_txt2img, preview_prompt,
                        preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale, preview_seed,
@@ -112,6 +113,7 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
     if not isinstance(shared.loaded_hypernetwork, Hypernetwork):
         raise RuntimeError("Cannot perform training for Hypernetwork structure pipeline!")

+    shared.state.job = "train-hypernetwork"
     shared.state.textinfo = "Initializing hypernetwork training..."
     shared.state.job_count = steps
@@ -142,7 +144,11 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
         return hypernetwork, filename

     scheduler = LearnRateScheduler(learn_rate, steps, initial_step)
-
+    if shared.opts.training_enable_tensorboard:
+        print("Tensorboard logging enabled")
+        tensorboard_writer = tensorboard_setup(log_directory)
+    else:
+        tensorboard_writer = None
     # dataset loading may take a while, so input validations and early returns should be done before this
     shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
@@ -163,11 +169,19 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
     dl = PersonalizedDataLoader(ds, latent_sampling_method=latent_sampling_method, batch_size=ds.batch_size, pin_memory=pin_memory)

+    old_parallel_processing_allowed = shared.parallel_processing_allowed
     if unload:
+        shared.parallel_processing_allowed = False
         shared.sd_model.cond_stage_model.to(devices.cpu)
         shared.sd_model.first_stage_model.to(devices.cpu)
+        detach_grad = False  # test code that removes EMA
+        if detach_grad:
+            shared.sd_model.cond_stage_model.requires_grad_(False)
+            shared.sd_model.first_stage_model.requires_grad_(False)
+        torch.cuda.empty_cache()
+
     weights = hypernetwork.weights(True)  # Here we use optimizer from saved HN, or we can specify as UI option.
@@ -285,7 +299,8 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
                 epoch_num = hypernetwork.step // steps_per_epoch
                 epoch_step = hypernetwork.step % steps_per_epoch

-                pbar.set_description(f"[Epoch {epoch_num}: {epoch_step + 1}/{steps_per_epoch}]loss: {loss_step:.7f}")
+                description = f"Training hypernetwork [Epoch {epoch_num}: {epoch_step + 1}/{steps_per_epoch}]loss: {loss_step:.7f}"
+                pbar.set_description(description)
                 if hypernetwork_dir is not None and ((use_beta_scheduler and scheduler_beta.is_EOC(hypernetwork.step) and save_when_converge) or (save_hypernetwork_every > 0 and steps_done % save_hypernetwork_every == 0)):
                     # Before saving, change name to match current checkpoint.
                     hypernetwork_name_every = f'{hypernetwork_name}-{steps_done}'
@@ -301,7 +316,11 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
                     "loss": f"{loss_step:.7f}",
                     "learn_rate": optimizer.param_groups[0]['lr']
                 })
-
+                if shared.opts.training_enable_tensorboard:
+                    epoch_num = hypernetwork.step // len(ds)
+                    epoch_step = hypernetwork.step - (epoch_num * len(ds)) + 1
+                    mean_loss = sum(sum(x) for x in loss_dict.values()) / sum(len(x) for x in loss_dict.values())
+                    tensorboard_add(tensorboard_writer, loss=mean_loss, global_step=hypernetwork.step, step=epoch_step, learn_rate=scheduler.learn_rate, epoch_num=epoch_num)
                 if images_dir is not None and (use_beta_scheduler and scheduler_beta.is_EOC(hypernetwork.step) and create_when_converge) or (create_image_every > 0 and steps_done % create_image_every == 0):
                     forced_filename = f'{hypernetwork_name}-{steps_done}'
                     last_saved_image = os.path.join(images_dir, forced_filename)
@@ -341,6 +360,8 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
                     processed = processing.process_images(p)
                     image = processed.images[0] if len(processed.images) > 0 else None

+                    if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images:
+                        tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}", image, hypernetwork.step)
                     if unload:
                         shared.sd_model.cond_stage_model.to(devices.cpu)
@@ -352,7 +373,7 @@ def train_hypernetwork(hypernetwork_name, learn_rate, batch_size, gradient_step,
                     if move_optimizer:
                         optim_to(optimizer, devices.device)
                     if image is not None:
-                        shared.state.current_image = image
+                        shared.state.assign_current_image(image)
                         last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt,
                                                                              shared.opts.samples_format, processed.infotexts[0],
                                                                              p=p,
@@ -377,6 +398,7 @@ Last saved image: {html.escape(last_saved_image)}
     pbar.leave = False
     pbar.close()
     hypernetwork.eval()
+    shared.parallel_processing_allowed = old_parallel_processing_allowed

     report_statistics(loss_dict)
     filename = os.path.join(shared.cmd_opts.hypernetwork_dir, f'{hypernetwork_name}.pt')
     hypernetwork.optimizer_name = optimizer_name
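Note: the `tensorboard_setup`, `tensorboard_add`, and `tensorboard_add_image` helpers imported above live in `modules/textual_inversion/textual_inversion.py` in the upstream webui. A minimal sketch of what this patch assumes they look like follows; the names and keyword arguments match the call sites in the hunks above, but treat the bodies as illustrative rather than the exact upstream implementation.

# Sketch of the assumed tensorboard helpers (illustrative, not the exact upstream code).
import os

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter


def tensorboard_setup(log_directory):
    # One SummaryWriter per training run, writing under <log_directory>/tensorboard.
    os.makedirs(os.path.join(log_directory, "tensorboard"), exist_ok=True)
    return SummaryWriter(log_dir=os.path.join(log_directory, "tensorboard"))


def tensorboard_add_scaler(tensorboard_writer, tag, value, step):
    tensorboard_writer.add_scalar(tag=tag, scalar_value=value, global_step=step)


def tensorboard_add(tensorboard_writer, loss, global_step, step, learn_rate, epoch_num):
    # Global curves plus per-epoch curves, matching the keyword arguments
    # used at the call site in the hunk above.
    tensorboard_add_scaler(tensorboard_writer, "Loss/train", loss, global_step)
    tensorboard_add_scaler(tensorboard_writer, f"Loss/train/epoch-{epoch_num}", loss, step)
    tensorboard_add_scaler(tensorboard_writer, "Learn rate/train", learn_rate, global_step)
    tensorboard_add_scaler(tensorboard_writer, f"Learn rate/train/epoch-{epoch_num}", learn_rate, step)


def tensorboard_add_image(tensorboard_writer, tag, pil_image, step):
    # SummaryWriter.add_image expects a CHW tensor, so convert the PIL preview first.
    img_tensor = torch.as_tensor(np.asarray(pil_image, dtype=np.uint8))
    img_tensor = img_tensor.view(pil_image.size[1], pil_image.size[0], len(pil_image.getbands()))
    img_tensor = img_tensor.permute((2, 0, 1))
    tensorboard_writer.add_image(tag, img_tensor, global_step=step)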
diff --git a/patches/external_pr/textual_inversion.py b/patches/external_pr/textual_inversion.py
index 31d340d..120687a 100644
--- a/patches/external_pr/textual_inversion.py
+++ b/patches/external_pr/textual_inversion.py
@@ -19,6 +19,8 @@ from modules.textual_inversion.image_embedding import caption_image_overlay, ins
 from modules.textual_inversion.learn_schedule import LearnRateScheduler
 from modules.textual_inversion.textual_inversion import save_embedding

+from torch.utils.tensorboard import SummaryWriter
+from modules.textual_inversion.textual_inversion import tensorboard_add, tensorboard_setup, tensorboard_add_scaler, tensorboard_add_image

 #apply OsError avoid here
 delayed_values = {}
@@ -84,7 +86,7 @@ def validate_train_inputs(model_name, learn_rate, batch_size, gradient_step, dat
     assert log_directory, "Log directory is empty"


-def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_root, log_directory, training_width,
+def train_embedding(id_task, embedding_name, learn_rate, batch_size, gradient_step, data_root, log_directory, training_width,
                     training_height, steps, shuffle_tags, tag_drop_out, latent_sampling_method, create_image_every,
                     save_embedding_every, template_file, save_image_with_stored_embedding, preview_from_txt2img,
                     preview_prompt, preview_negative_prompt, preview_steps, preview_sampler_index, preview_cfg_scale,
@@ -200,6 +202,12 @@ def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_
     scheduler = LearnRateScheduler(learn_rate, steps, initial_step)
     # dataset loading may take a while, so input validations and early returns should be done before this
     shared.state.textinfo = f"Preparing dataset from {html.escape(data_root)}..."
+    old_parallel_processing_allowed = shared.parallel_processing_allowed
+
+    tensorboard_writer = None
+    if shared.opts.training_enable_tensorboard:
+        print("Tensorboard logging enabled")
+        tensorboard_writer = tensorboard_setup(log_directory)
     pin_memory = shared.opts.pin_memory

     shared.sd_model.cond_stage_model.to(devices.device)
@@ -220,6 +228,7 @@ def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_
                                batch_size=ds.batch_size, pin_memory=pin_memory)

     if unload:
+        shared.parallel_processing_allowed = False
         shared.sd_model.first_stage_model.to(devices.cpu)

     embedding.vec.requires_grad = True
@@ -380,21 +389,19 @@ def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_
             processed = processing.process_images(p)
             image = processed.images[0] if len(processed.images) > 0 else None

-            if unload:
-                shared.sd_model.first_stage_model.to(devices.cpu)
-            torch.set_rng_state(rng_state)
-            if torch.cuda.is_available():
-                torch.cuda.set_rng_state_all(cuda_rng_state)
             if move_optimizer:
                 optim_to(optimizer, devices.device)
             if image is not None:
-                shared.state.current_image = image
+                shared.state.assign_current_image(image)
                 last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt,
                                                                      shared.opts.samples_format, processed.infotexts[0],
                                                                      p=p, forced_filename=forced_filename,
                                                                      save_to_dirs=False)
                 last_saved_image += f", prompt: {preview_text}"

+                if shared.opts.training_enable_tensorboard and shared.opts.training_tensorboard_save_images:
+                    tensorboard_add_image(tensorboard_writer, f"Validation at epoch {epoch_num}", image,
+                                          embedding.step)
             if save_image_with_stored_embedding and os.path.exists(
                     last_saved_file) and embedding_yet_to_be_embedded:
@@ -422,7 +429,11 @@ def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_
                 captioned_image.save(last_saved_image_chunks, "PNG", pnginfo=info)
                 embedding_yet_to_be_embedded = False
-
+            if unload:
+                shared.sd_model.first_stage_model.to(devices.cpu)
+            torch.set_rng_state(rng_state)
+            if torch.cuda.is_available():
+                torch.cuda.set_rng_state_all(cuda_rng_state)
             last_saved_image, last_text_info = images.save_image(image, images_dir, "", p.seed, p.prompt,
                                                                  shared.opts.samples_format, processed.infotexts[0],
                                                                  p=p,
@@ -430,6 +441,7 @@ def train_embedding(embedding_name, learn_rate, batch_size, gradient_step, data_
                                                                  save_to_dirs=False)
             last_saved_image += f", prompt: {preview_text}"

+        shared.state.job_no = embedding.step

         shared.state.textinfo = f"""
@@ -450,5 +462,5 @@ Last saved image: {html.escape(last_saved_image)}
     pbar.leave = False
     pbar.close()
     shared.sd_model.first_stage_model.to(devices.device)
-
+    shared.parallel_processing_allowed = old_parallel_processing_allowed
     return embedding, filename
\ No newline at end of file
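Note: both training entry points now take a leading `id_task` argument, which is why the ui.py hunks below prepend a hidden `dummy_component` to each Gradio `inputs` list; the `_js="start_training_textual_inversion"` hook replaces the placeholder's value with a generated task id before the Python function runs. A standalone toy sketch of that pattern, assuming nothing beyond stock Gradio (hypothetical example, not the webui code):

# Toy illustration of the dummy-component pattern (hypothetical example).
import gradio as gr


def train(id_task, name):
    # In the webui, id_task identifies the job in the progress API;
    # here we just echo it back.
    return f"task={id_task}, name={name}"


with gr.Blocks() as demo:
    dummy_component = gr.Label(visible=False)  # placeholder the JS hook fills in
    name = gr.Textbox(label="Embedding name")
    result = gr.Textbox(label="Result")
    # The real UI additionally passes _js="start_training_textual_inversion",
    # which injects the generated task id as the first argument.
    gr.Button("Train").click(fn=train, inputs=[dummy_component, name], outputs=[result])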
diff --git a/patches/external_pr/ui.py b/patches/external_pr/ui.py
index a0646c1..44a819f 100644
--- a/patches/external_pr/ui.py
+++ b/patches/external_pr/ui.py
@@ -36,6 +36,7 @@ Hypernetwork saved to {html.escape(filename)}


 def on_train_gamma_tab(params=None):
+    dummy_component = gr.Label(visible=False)
     with gr.Tab(label="Train_Gamma") as train_gamma:
         gr.HTML(value="Train an embedding or Hypernetwork; you must specify a directory [wiki]")
@@ -145,6 +146,7 @@ def on_train_gamma_tab(params=None):
             fn=wrap_gradio_gpu_call(train_embedding_external, extra_outputs=[gr.update()]),
             _js="start_training_textual_inversion",
             inputs=[
+                dummy_component,
                 train_embedding_name,
                 embedding_learn_rate,
                 batch_size,
@@ -192,6 +194,7 @@ def on_train_gamma_tab(params=None):
             fn=wrap_gradio_gpu_call(train_hypernetwork_ui, extra_outputs=[gr.update()]),
             _js="start_training_textual_inversion",
             inputs=[
+                dummy_component,
                 train_hypernetwork_name,
                 hypernetwork_learn_rate,
                 batch_size,
diff --git a/patches/hashes_backup.py b/patches/hashes_backup.py
index f110b2f..1ace37f 100644
--- a/patches/hashes_backup.py
+++ b/patches/hashes_backup.py
@@ -7,7 +7,7 @@ import filelock

 # This is full copy of modules/hashes. This will be only loaded if compatibility issue happens due to version mismatch.
 cache_filename = "cache.json"
 cache_data = None
-
+blksize = 1 << 20

 def dump_cache():
     with filelock.FileLock(cache_filename+".lock"):
@@ -34,9 +34,9 @@ def cache(subsection):

 def calculate_sha256(filename):
     hash_sha256 = hashlib.sha256()
-
+    global blksize
     with open(filename, "rb") as f:
-        for chunk in iter(lambda: f.read(4096), b""):
+        for chunk in iter(lambda: f.read(blksize), b""):
             hash_sha256.update(chunk)

     return hash_sha256.hexdigest()