diff --git a/MANUAL_TRAIN.md b/MANUAL_TRAIN.md index 673fda8..9a1d7c2 100644 --- a/MANUAL_TRAIN.md +++ b/MANUAL_TRAIN.md @@ -1,52 +1,52 @@ -# 단일폴더 수동 학습 사용 예시 +# I. 단일폴더 수동 학습 사용 예시 ## 1. 기본 사용 (자동 계산) ```bash -python train_single.py --folder ../dataset/training/01_alice +run-train-single --folder ../dataset/training/01_alice ``` ## 2. Epochs만 수동 지정 ```bash -python train_single.py --folder ../dataset/training/01_alice --epochs 25 +run-train-single --folder ../dataset/training/01_alice --epochs 25 ``` ## 3. 세밀한 조정 ```bash -python train_single.py \ - --folder ../dataset/training/01_alice \ - --epochs 30 \ - --repeats 50 \ - --lr 0.00015 \ - --dim 64 \ +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --epochs 30 ^ + --repeats 50 ^ + --lr 0.00015 ^ + --dim 64 ^ --alpha 32 ``` ## 4. 고해상도 학습 ```bash -python train_single.py \ - --folder ../dataset/training/01_alice \ - --resolution 1024,1024 \ +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --resolution 1024,1024 ^ --batch-size 1 ``` ## 5. 빠른 테스트 ```bash -python train_single.py \ - --folder ../dataset/training/01_alice \ - --epochs 5 \ - --repeats 10 \ +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --epochs 5 ^ + --repeats 10 ^ --save-every 1 ``` ## 6. 
완전 수동 모드 ```bash -python train_single.py \ - --folder ../dataset/training/01_alice \ - --no-auto \ - --epochs 20 \ - --repeats 30 \ - --lr 0.0001 \ - --optimizer AdamW8bit \ +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --no-auto ^ + --epochs 20 ^ + --repeats 30 ^ + --lr 0.0001 ^ + --optimizer AdamW8bit ^ --scheduler cosine ``` @@ -84,7 +84,7 @@ python train_batch.py ### train_single.py (단일 수동) ```bash # 특정 폴더만 세밀 조정 -python train_single.py --folder ../dataset/training/mainchar/01_alice --epochs 30 --lr 0.00015 +run-train-single --folder ../dataset/training/mainchar/01_alice --epochs 30 --lr 0.00015 → alice만 커스텀 파라미터로 학습 ``` @@ -96,19 +96,130 @@ python train_single.py --folder ../dataset/training/mainchar/01_alice --epochs 3 python train_batch.py # 2. 결과가 좋지 않은 캐릭터만 재학습 -python train_single.py --folder ../dataset/training/mainchar/01_alice --epochs 25 +run-train-single --folder ../dataset/training/mainchar/01_alice --epochs 25 ``` ### 고급 사용자 ```bash # 처음부터 세밀하게 조정 -python train_single.py \ - --folder ../dataset/training/mainchar/01_alice \ - --epochs 30 \ - --repeats 50 \ - --lr 0.00012 \ - --dim 64 \ - --alpha 32 \ - --optimizer Prodigy \ +run-train-single ^ + --folder ../dataset/training/mainchar/01_alice ^ + --epochs 30 ^ + --repeats 50 ^ + --lr 0.00012 ^ + --dim 64 ^ + --alpha 32 ^ + --optimizer Prodigy ^ --resolution 1024,1024 ``` + +# II. 단일폴더 학습재개(resume) 방법 + +## 1. 기본 Resume +```cmd +run-train-single --folder ../dataset/training/mainchar/01_alice --resume ../output_models/alice-epoch-010.safetensors +``` + +## 2. Resume + Learning Rate 조정 (Fine-tuning) +```cmd +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --resume ../output_models/alice-epoch-010.safetensors ^ + --epochs 20 ^ + --lr 0.00005 +``` + +## 3. 
Resume + 더 많은 데이터 +```cmd +run-train-single ^ + --folder ../dataset/training/01_alice_more ^ + --resume ../output_models/alice-epoch-015.safetensors ^ + --epochs 10 +``` + +## 주의사항 +## ✅ Resume 시 동일하게 유지해야 할 것 + +- --dim (network_dim) +- --alpha (network_alpha) +- 네트워크 구조 관련 설정 + +## ⚠️ Resume 시 변경 가능한 것 + +- --epochs (더 학습) +- --lr (learning rate 조정) +- --repeats (데이터 반복) +- --optimizer (optimizer 변경) +- --scheduler (스케줄러 변경) + +## ❌ Resume 시 변경하면 안되는 것 +```cmd +# 잘못된 예 +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --resume ../output_models/alice-epoch-010.safetensors ^ + --dim 64 # ❌ 원래 32였으면 에러! +``` + +## 실전 예시 + +### 시나리오 1: 학습이 중단됨 +```cmd +# 10 epoch에서 중단 +# → 10 epoch부터 이어서 15 epoch까지 + +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --resume ../output_models/alice-epoch-010.safetensors ^ + --epochs 15 +``` + +### 시나리오 2: Overfitting 방지 (LR 감소) +```cmd +# 학습률 낮춰서 Fine-tuning +run-train-single ^ + --folder ../dataset/training/01_alice ^ + --resume ../output_models/alice-epoch-015.safetensors ^ + --epochs 25 ^ + --lr 0.00005 +``` + +### 시나리오 3: 데이터 추가 후 재학습 +```cmd +# 이미지 20장 → 50장으로 증가 +run-train-single ^ + --folder ../dataset/training/01_alice_extended ^ + --resume ../output_models/alice-epoch-015.safetensors ^ + --epochs 10 ^ + --repeats 20 +``` + +## 출력 예시 +``` +====================================================================== +🎯 SDXL LoRA Training - Single Mode +====================================================================== +📁 Folder: ../dataset/training/01_alice +💾 Output: alice.safetensors +📋 Config: config-24g.json +🖥️ GPU: 0 (24GB VRAM) +⚡ Precision: bf16 +🔄 Resume from: ../output_models/alice-epoch-010.safetensors +---------------------------------------------------------------------- +📊 Training Parameters +---------------------------------------------------------------------- + Images: 25 + Repeats: 48 (auto) + Epochs: 20 (manual) + Batch size: 1 + 
Images/epoch: 1200 + Steps/epoch: 1200 + Total steps: 24000 +====================================================================== + +학습을 시작하시겠습니까? (y/N): y + +🔄 Resuming from: ../output_models/alice-epoch-010.safetensors + +🚀 Starting training... +``` \ No newline at end of file diff --git a/resume-train.cmd b/resume-train.cmd new file mode 100644 index 0000000..67a9646 --- /dev/null +++ b/resume-train.cmd @@ -0,0 +1,31 @@ +accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 ^ + sdxl_train_network.py ^ + --resume="../output_models" ^ + --max_train_epochs=20 ^ + --pretrained_model_name_or_path="../models/sd_xl_base_1.0.safetensors" ^ + --train_data_dir="../dataset/train/mainchar" ^ + --output_dir="../output_models" ^ + --logging_dir="../logs" ^ + --output_name="karina" ^ + --network_module=networks.lora ^ + --network_dim=32 ^ + --network_alpha=16 ^ + --learning_rate=1e-4 ^ + --optimizer_type="AdamW8bit" ^ + --lr_scheduler="cosine" ^ + --lr_warmup_steps=100 ^ + --save_every_n_epochs=1 ^ + --mixed_precision="bf16" ^ + --save_precision="bf16" ^ + --cache_latents ^ + --cache_latents_to_disk ^ + --gradient_checkpointing ^ + --xformers ^ + --seed=42 ^ + --bucket_no_upscale ^ + --min_bucket_reso=512 ^ + --max_bucket_reso=2048 ^ + --bucket_reso_steps=64 ^ + --resolution="1024,1024" ^ + --network_train_unet_only ^ + --cache_text_encoder_outputs \ No newline at end of file diff --git a/run-caption-watcher.cmd b/run-caption-watcher.cmd new file mode 100644 index 0000000..6097b88 --- /dev/null +++ b/run-caption-watcher.cmd @@ -0,0 +1,6 @@ +@echo off +set "CUDA_VISIBLE_DEVICES=3" +echo [Watcher] Starting caption watcher... 
+python cap-watcher.py --overwrite +REM --img_dir "../dataset/captioning/mainchar" --out_dir "../dataset/captioning/mainchar" +pause \ No newline at end of file diff --git a/run-train.cmd b/run-train-auto.cmd similarity index 60% rename from run-train.cmd rename to run-train-auto.cmd index 6e22b42..6152325 100644 --- a/run-train.cmd +++ b/run-train-auto.cmd @@ -1,5 +1,6 @@ @echo off REM 첫 번째 argument를 명령어로 받아서 컨테이너에서 실행 REM 모든 argument를 그대로 넘기려면 %* 사용 +docker exec -it sdxl_train_captioner bash -c "cd /app/sdxl_train_captioner/sd-scripts; ./run-train-auto.py 1 2>&1 | tee /app/sdxl_train_captioner/logs/train_$(date +%%Y%%m%%d_%%H%%M%%S).log" -docker exec -it sdxl_train_captioner bash -c "cd /app/sdxl_train_captioner/sd-scripts; ./run-train.sh config-24g.json 1 2>&1 | tee /app/sdxl_train_captioner/logs/train_$(date +%%Y%%m%%d_%%H%%M%%S).log" +pause \ No newline at end of file diff --git a/run-train-auto.py b/run-train-auto.py deleted file mode 100644 index dafded4..0000000 --- a/run-train-auto.py +++ /dev/null @@ -1,368 +0,0 @@ -#!/usr/bin/env python3 -""" -SDXL LoRA 일괄 학습 스크립트 -- 학습 폴더 하위의 여러 캐릭터/개념을 자동으로 개별 LoRA 학습 -- VRAM에 따른 자동 설정 (bf16/fp16) -- 이미지 수에 따른 최적 파라미터 자동 계산 -""" - -import os -import sys -import json -import subprocess -import argparse -from pathlib import Path - - -class TrainingConfig: - """학습 설정 관리""" - - def __init__(self, config_file, gpu_id=0, force_repeats=None): - self.config_file = config_file - self.gpu_id = gpu_id - self.force_repeats = force_repeats - - # VRAM 감지 - self.vram_size = self.get_vram_size() - - # VRAM에 따른 설정 - if self.vram_size >= 20: - self.precision = "bf16" - self.target_steps = 1800 - else: - # 16GB 이하는 fp16 config 사용 - self.config_file = "config-16g.json" - self.precision = "fp16" - self.target_steps = 1500 - - # Config 파일 로드 - self.load_config() - - def get_vram_size(self): - """NVIDIA GPU VRAM 크기 감지 (GB)""" - try: - cmd = [ - "nvidia-smi", - "--query-gpu=memory.total", - "--format=csv,noheader,nounits", - f"-i 
{self.gpu_id}" - ] - result = subprocess.run( - ' '.join(cmd), - shell=True, - capture_output=True, - text=True - ) - vram_mb = int(result.stdout.strip()) - vram_gb = vram_mb // 1024 - return vram_gb - except Exception as e: - print(f"⚠️ VRAM 감지 실패, 기본값(24GB) 사용: {e}") - return 24 - - def load_config(self): - """config.json 로드""" - if not os.path.exists(self.config_file): - print(f"❌ Config 파일 없음: {self.config_file}") - sys.exit(1) - - with open(self.config_file, 'r', encoding='utf-8') as f: - self.config = json.load(f) - - self.train_dir = self.config['folders']['train_data_dir'] - self.output_dir = self.config['folders']['output_dir'] - self.batch_size = self.config['training'].get('batch_size', 1) - - -class LoRATrainer: - """단일 LoRA 학습 실행""" - - def __init__(self, training_config): - self.config = training_config - self.image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.bmp'} - - def find_training_folders(self): - """학습 폴더 찾기 (순서_이름 패턴)""" - train_dir = self.config.train_dir - - if not os.path.isdir(train_dir): - print(f"❌ 학습 디렉토리 없음: {train_dir}") - return [] - - folders = [] - for item in os.listdir(train_dir): - item_path = os.path.join(train_dir, item) - if not os.path.isdir(item_path): - continue - - # 패턴: 01_alice, 02_bob 등 - parts = item.split('_', 1) - if len(parts) == 2 and parts[0].isdigit(): - order = int(parts[0]) - name = parts[1] - folders.append({ - 'order': order, - 'name': name, - 'path': item_path, - 'folder': item - }) - - # 순서대로 정렬 - folders.sort(key=lambda x: x['order']) - return folders - - def count_images(self, folder_path): - """폴더 내 이미지 개수 세기""" - count = 0 - for file in os.listdir(folder_path): - if Path(file).suffix.lower() in self.image_extensions: - count += 1 - return count - - def calculate_training_params(self, image_count): - """이미지 수에 따른 최적 학습 파라미터 계산""" - batch_size = self.config.batch_size - target_steps = self.config.target_steps - - # 강제 반복 횟수가 지정되면 사용 - if self.config.force_repeats is not None: - optimal_repeats = 
self.config.force_repeats - else: - # 이미지 수에 따른 자동 계산 - if image_count < 20: - optimal_repeats = max(80, min(200, target_steps // (image_count * 10))) - elif image_count < 50: - optimal_repeats = max(30, min(80, target_steps // (image_count * 10))) - elif image_count < 100: - optimal_repeats = max(15, min(30, target_steps // (image_count * 10))) - else: - optimal_repeats = max(5, min(20, target_steps // (image_count * 10))) - - # Epochs 계산 - images_per_epoch = image_count * optimal_repeats - steps_per_epoch = images_per_epoch // batch_size - actual_epochs = max(1, round(target_steps / steps_per_epoch)) - actual_epochs = min(max(actual_epochs, 5), 30) - actual_total_steps = actual_epochs * steps_per_epoch - - return { - 'repeats': optimal_repeats, - 'epochs': actual_epochs, - 'steps_per_epoch': steps_per_epoch, - 'total_steps': actual_total_steps - } - - def train_single_lora(self, folder_info): - """단일 LoRA 학습 실행""" - name = folder_info['name'] - folder_path = folder_info['path'] - - print(f"\n{'=' * 70}") - print(f"🎯 Training LoRA: {name}") - print(f"{'=' * 70}") - - # 이미지 개수 확인 - image_count = self.count_images(folder_path) - if image_count == 0: - print(f"⚠️ 이미지 없음: {folder_path}") - print(f"{'=' * 70}\n") - return False - - # 파라미터 계산 - params = self.calculate_training_params(image_count) - - # 정보 출력 - print(f"📊 Training Configuration") - print(f"{'-' * 70}") - print(f" GPU ID: {self.config.gpu_id}") - print(f" VRAM: {self.config.vram_size}GB") - print(f" Precision: {self.config.precision}") - print(f" Config: {self.config.config_file}") - print(f" Folder: {folder_info['folder']}") - print(f" Images: {image_count}") - print(f" Repeats: {params['repeats']}" + - (" (forced)" if self.config.force_repeats else " (auto)")) - print(f" Images/epoch: {image_count * params['repeats']}") - print(f" Steps/epoch: {params['steps_per_epoch']}") - print(f" Epochs: {params['epochs']}") - print(f" Total steps: {params['total_steps']}") - print(f"{'-' * 70}\n") - - # accelerate 
명령어 구성 - cmd = [ - "accelerate", "launch", - "--num_cpu_threads_per_process", "1", - "--mixed_precision", self.config.precision, - "sdxl_train_network.py", - f"--config_file={self.config.config_file}", - f"--train_data_dir={folder_path}", - f"--output_name={name}", - f"--max_train_epochs={params['epochs']}", - f"--dataset_repeats={params['repeats']}", - f"--mixed_precision={self.config.precision}" - ] - - # 실행 - try: - env = os.environ.copy() - env['CUDA_VISIBLE_DEVICES'] = str(self.config.gpu_id) - - print(f"🚀 Starting training...\n") - result = subprocess.run(cmd, env=env, check=True) - - print(f"\n✅ {name} 학습 완료!") - print(f"{'=' * 70}\n") - return True - - except subprocess.CalledProcessError as e: - print(f"\n❌ {name} 학습 실패: {e}") - print(f"{'=' * 70}\n") - return False - except KeyboardInterrupt: - print(f"\n⚠️ 사용자에 의해 중단됨") - return False - - def run_batch_training(self): - """일괄 학습 실행""" - folders = self.find_training_folders() - - if not folders: - print("❌ 학습 폴더를 찾을 수 없습니다!") - print(f" 경로: {self.config.train_dir}") - print(f" 패턴: 01_name, 02_name, ...") - return - - print(f"\n{'=' * 70}") - print(f"🚀 SDXL LoRA Batch Training") - print(f"{'=' * 70}") - print(f"📁 학습 폴더: {self.config.train_dir}") - print(f"💾 출력 폴더: {self.config.output_dir}") - print(f"🖥️ GPU: {self.config.gpu_id} ({self.config.vram_size}GB)") - print(f"⚡ Precision: {self.config.precision}") - print(f"📋 Config: {self.config.config_file}") - print(f"\n발견된 학습 폴더: {len(folders)}개") - print(f"{'-' * 70}") - for f in folders: - img_count = self.count_images(f['path']) - print(f" {f['order']:02d}. {f['name']:20s} ({img_count} images)") - print(f"{'=' * 70}\n") - - # 사용자 확인 - try: - response = input("학습을 시작하시겠습니까? 
(y/N): ") - if response.lower() not in ['y', 'yes']: - print("❌ 학습 취소됨") - return - except KeyboardInterrupt: - print("\n❌ 학습 취소됨") - return - - # 학습 실행 - results = [] - for i, folder in enumerate(folders, 1): - print(f"\n[{i}/{len(folders)}] Processing: {folder['name']}...") - success = self.train_single_lora(folder) - results.append({ - 'name': folder['name'], - 'success': success - }) - - # 실패 시 계속 진행할지 물어봄 - if not success: - try: - response = input("❓ 계속 진행하시겠습니까? (Y/n): ") - if response.lower() in ['n', 'no']: - print("⚠️ 나머지 학습 건너뜀") - break - except KeyboardInterrupt: - print("\n⚠️ 나머지 학습 건너뜀") - break - - # 결과 요약 - print(f"\n{'=' * 70}") - print(f"📊 Training Summary") - print(f"{'=' * 70}") - success_count = sum(1 for r in results if r['success']) - fail_count = len(results) - success_count - - for r in results: - status = "✅" if r['success'] else "❌" - print(f"{status} {r['name']}") - - print(f"{'-' * 70}") - print(f"✅ 성공: {success_count}/{len(results)}") - if fail_count > 0: - print(f"❌ 실패: {fail_count}/{len(results)}") - print(f"{'=' * 70}\n") - - -def main(): - parser = argparse.ArgumentParser( - description="SDXL LoRA 일괄 학습 스크립트", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -사용 예시: - python train_batch.py - python train_batch.py config-16g.json - python train_batch.py config-24g.json 0 15 - -폴더 구조: - training/ - ├── 01_alice/ - │ └── *.jpg - ├── 02_bob/ - │ └── *.jpg - └── 03_background/ - └── *.jpg - """ - ) - - parser.add_argument( - "config", - nargs="?", - default="config-24g.json", - help="Config 파일 (기본: config-24g.json)" - ) - - parser.add_argument( - "gpu_id", - nargs="?", - type=int, - default=0, - help="GPU ID (기본: 0)" - ) - - parser.add_argument( - "repeats", - nargs="?", - type=int, - default=None, - help="강제 반복 횟수 (기본: 자동 계산)" - ) - - args = parser.parse_args() - - try: - # 설정 로드 - training_config = TrainingConfig( - config_file=args.config, - gpu_id=args.gpu_id, - force_repeats=args.repeats - ) - - # 학습 실행 - 
trainer = LoRATrainer(training_config) - trainer.run_batch_training() - - except KeyboardInterrupt: - print("\n\n⚠️ 프로그램 중단됨") - sys.exit(1) - except Exception as e: - print(f"\n❌ 오류 발생: {e}") - import traceback - traceback.print_exc() - sys.exit(1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/run-train-single.cmd b/run-train-single.cmd new file mode 100644 index 0000000..76e06e9 --- /dev/null +++ b/run-train-single.cmd @@ -0,0 +1,58 @@ +@echo off +setlocal enabledelayedexpansion + +REM =================================== +REM SDXL LoRA 단일 학습 (Windows → Docker) +REM =================================== + +REM 도움말 +if "%1"=="" ( + echo Usage: run-train-single.cmd --folder FOLDER [OPTIONS] + echo. + echo Examples: + echo run-train-single.cmd --folder ../dataset/training/01_alice + echo run-train-single.cmd --folder ../dataset/training/01_alice --epochs 25 + echo run-train-single.cmd --folder ../dataset/training/01_alice --lr 0.0002 --dim 64 + echo. + echo All arguments are passed to Python script inside container. + exit /b 1 +) + +REM 현재 시간 (로그 파일명용) +for /f "tokens=1-6 delims=/:. " %%a in ("%date% %time%") do ( + set timestamp=%%a%%b%%c_%%d%%e%%f +) +set timestamp=%timestamp: =0% + +REM 모든 arguments를 하나의 문자열로 결합 +set args=%* + +REM 작은따옴표 이스케이프 (Bash에서 안전하게) +set args=%args:'='\''% + +echo =================================== +echo Starting SDXL LoRA Training +echo =================================== +echo Arguments: %args% +echo Log file: train_%timestamp%.log +echo =================================== +echo. + +REM Docker에서 실행 +docker exec -it sdxl_train_captioner bash -c "set -o pipefail; cd /app/sdxl_train_captioner/sd-scripts && python run-train-single.py %args% 2>&1 | tee /app/sdxl_train_captioner/logs/train_%timestamp%.log" + +if %ERRORLEVEL% EQU 0 ( + echo. + echo =================================== + echo Training completed successfully! + echo =================================== +) else ( + echo. 
+ echo =================================== + echo Training failed with error code: %ERRORLEVEL% + echo =================================== +) + +endlocal + +pause \ No newline at end of file diff --git a/run-train-single.ps1 b/run-train-single.ps1 new file mode 100644 index 0000000..60a20e7 --- /dev/null +++ b/run-train-single.ps1 @@ -0,0 +1,55 @@ +# =================================== +# SDXL LoRA 단일 학습 (PowerShell → Docker) +# =================================== + +param( + [Parameter(ValueFromRemainingArguments=$true)] + [string[]]$Arguments +) + +if ($Arguments.Count -eq 0) { + Write-Host "Usage: run-train-single.ps1 --folder FOLDER [OPTIONS]" -ForegroundColor Yellow + Write-Host "" + Write-Host "Examples:" -ForegroundColor Cyan + Write-Host " .\run-train-single.ps1 --folder ../dataset/training/01_alice" + Write-Host " .\run-train-single.ps1 --folder ../dataset/training/01_alice --epochs 25" + Write-Host " .\run-train-single.ps1 --folder ../dataset/training/01_alice --lr 0.0002 --dim 64" + exit 1 +} + +# 타임스탬프 +$timestamp = Get-Date -Format "yyyyMMdd_HHmmss" + +# Arguments를 문자열로 결합 +$argsString = $Arguments -join ' ' + +# 작은따옴표 이스케이프 +$argsString = $argsString -replace "'", "'\\''" + +Write-Host "===================================" -ForegroundColor Green +Write-Host "Starting SDXL LoRA Training" -ForegroundColor Green +Write-Host "===================================" -ForegroundColor Green +Write-Host "Arguments: $argsString" -ForegroundColor Cyan +Write-Host "Log file: train_$timestamp.log" -ForegroundColor Cyan +Write-Host "===================================" -ForegroundColor Green +Write-Host "" + +# Docker 명령어 +$dockerCmd = "set -o pipefail; cd /app/sdxl_train_captioner/sd-scripts && python run-train-single.py $argsString 2>&1 | tee /app/sdxl_train_captioner/logs/train_$timestamp.log" + +# 실행 +docker exec -it sdxl_train_captioner bash -c $dockerCmd + +if ($LASTEXITCODE -eq 0) { + Write-Host "" + Write-Host "===================================" -ForegroundColor Green + 
Write-Host "Training completed successfully!" -ForegroundColor Green + Write-Host "===================================" -ForegroundColor Green +} else { + Write-Host "" + Write-Host "===================================" -ForegroundColor Red + Write-Host "Training failed with error code: $LASTEXITCODE" -ForegroundColor Red + Write-Host "===================================" -ForegroundColor Red +} + +pause \ No newline at end of file diff --git a/run-train-single.py b/run-train-single.py deleted file mode 100644 index 8571421..0000000 --- a/run-train-single.py +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/env python3 -""" -SDXL LoRA 단일 학습 스크립트 (고급 사용자용) -- 특정 폴더만 선택 학습 -- 세밀한 파라미터 조정 가능 -- Config 오버라이드 -""" - -import os -import sys -import json -import subprocess -import argparse -from pathlib import Path - - -def get_vram_size(gpu_id=0): - """NVIDIA GPU VRAM 크기 감지 (GB)""" - try: - cmd = f"nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits -i {gpu_id}" - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - vram_mb = int(result.stdout.strip()) - return vram_mb // 1024 - except: - return 24 # 기본값 - - -def count_images(folder_path): - """폴더 내 이미지 개수 세기""" - extensions = {'.jpg', '.jpeg', '.png', '.webp', '.bmp'} - count = 0 - for file in os.listdir(folder_path): - if Path(file).suffix.lower() in extensions: - count += 1 - return count - - -def calculate_auto_params(image_count, vram_size, batch_size=1): - """이미지 수 기반 자동 파라미터 계산""" - target_steps = 1800 if vram_size >= 20 else 1500 - - # Repeats 계산 - if image_count < 20: - repeats = max(80, min(200, target_steps // (image_count * 10))) - elif image_count < 50: - repeats = max(30, min(80, target_steps // (image_count * 10))) - elif image_count < 100: - repeats = max(15, min(30, target_steps // (image_count * 10))) - else: - repeats = max(5, min(20, target_steps // (image_count * 10))) - - # Epochs 계산 - images_per_epoch = image_count * repeats - steps_per_epoch = images_per_epoch // 
batch_size - epochs = max(1, round(target_steps / steps_per_epoch)) - epochs = min(max(epochs, 5), 30) - - return { - 'repeats': repeats, - 'epochs': epochs, - 'steps_per_epoch': steps_per_epoch, - 'total_steps': epochs * steps_per_epoch - } - - -def main(): - parser = argparse.ArgumentParser( - description="SDXL LoRA 단일 학습 (고급 설정)", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -사용 예시: - # 기본 (자동 계산) - python train_single.py --folder ../dataset/training/01_alice - - # 수동 파라미터 지정 - python train_single.py --folder ../dataset/training/01_alice --epochs 20 --repeats 30 - - # Learning rate 조정 - python train_single.py --folder ../dataset/training/01_alice --lr 0.0002 - - # Network dim 변경 - python train_single.py --folder ../dataset/training/01_alice --dim 64 --alpha 32 - - # 전체 커스텀 - python train_single.py \\ - --folder ../dataset/training/01_alice \\ - --output alice_v2 \\ - --config config-24g.json \\ - --gpu 0 \\ - --epochs 25 \\ - --repeats 40 \\ - --lr 0.00015 \\ - --dim 64 \\ - --alpha 32 \\ - --batch-size 2 - """ - ) - - # 필수 인자 - parser.add_argument( - "--folder", - required=True, - help="학습할 폴더 경로 (예: ../dataset/training/01_alice)" - ) - - # 기본 설정 - parser.add_argument( - "--config", - default="config-24g.json", - help="Config 파일 (기본: config-24g.json)" - ) - - parser.add_argument( - "--output", - help="출력 LoRA 이름 (기본: 폴더명에서 추출)" - ) - - parser.add_argument( - "--gpu", - type=int, - default=0, - help="GPU ID (기본: 0)" - ) - - # 학습 파라미터 - parser.add_argument( - "--epochs", - type=int, - help="총 Epoch 수 (기본: 자동 계산)" - ) - - parser.add_argument( - "--repeats", - type=int, - help="이미지 반복 횟수 (기본: 자동 계산)" - ) - - parser.add_argument( - "--batch-size", - type=int, - help="배치 사이즈 (기본: config 값)" - ) - - parser.add_argument( - "--lr", - type=float, - help="Learning rate (기본: config 값, 보통 1e-4)" - ) - - parser.add_argument( - "--dim", - type=int, - help="Network dimension (기본: config 값, 보통 32)" - ) - - parser.add_argument( - "--alpha", - type=int, - 
help="Network alpha (기본: config 값, 보통 16)" - ) - - parser.add_argument( - "--resolution", - help="해상도 (예: 1024,1024 또는 768,768)" - ) - - parser.add_argument( - "--save-every", - type=int, - help="N epoch마다 저장 (기본: config 값)" - ) - - # 고급 옵션 - parser.add_argument( - "--optimizer", - help="Optimizer (예: AdamW8bit, Lion, Prodigy)" - ) - - parser.add_argument( - "--scheduler", - help="LR Scheduler (예: cosine, constant, polynomial)" - ) - - parser.add_argument( - "--no-auto", - action="store_true", - help="자동 계산 비활성화 (epochs/repeats 수동 지정 필수)" - ) - - args = parser.parse_args() - - # 폴더 확인 - folder_path = Path(args.folder) - if not folder_path.exists() or not folder_path.is_dir(): - print(f"❌ 폴더를 찾을 수 없습니다: {folder_path}") - sys.exit(1) - - # 이미지 개수 - image_count = count_images(folder_path) - if image_count == 0: - print(f"❌ 이미지가 없습니다: {folder_path}") - sys.exit(1) - - # VRAM 감지 - vram_size = get_vram_size(args.gpu) - - # Config 자동 선택 - if vram_size >= 20: - precision = "bf16" - if args.config == "config-24g.json": - config_file = "config-24g.json" - else: - precision = "fp16" - config_file = "config-16g.json" - print(f"⚠️ VRAM {vram_size}GB < 20GB, fp16 모드로 전환") - - # Config 로드 - if not os.path.exists(config_file): - print(f"❌ Config 파일 없음: {config_file}") - sys.exit(1) - - with open(config_file, 'r', encoding='utf-8') as f: - config = json.load(f) - - batch_size = args.batch_size or config['training'].get('batch_size', 1) - - # 출력 이름 결정 - if args.output: - output_name = args.output - else: - # 폴더명에서 추출 (01_alice → alice) - folder_name = folder_path.name - parts = folder_name.split('_', 1) - if len(parts) == 2 and parts[0].isdigit(): - output_name = parts[1] - else: - output_name = folder_name - - # 파라미터 결정 - if args.no_auto: - # 수동 모드 - if not args.epochs or not args.repeats: - print("❌ --no-auto 사용 시 --epochs와 --repeats 필수입니다") - sys.exit(1) - epochs = args.epochs - repeats = args.repeats - steps_per_epoch = (image_count * repeats) // batch_size - total_steps = 
epochs * steps_per_epoch - else: - # 자동 계산 (오버라이드 가능) - auto_params = calculate_auto_params(image_count, vram_size, batch_size) - epochs = args.epochs or auto_params['epochs'] - repeats = args.repeats or auto_params['repeats'] - steps_per_epoch = (image_count * repeats) // batch_size - total_steps = epochs * steps_per_epoch - - # 학습 정보 출력 - print(f"\n{'=' * 70}") - print(f"🎯 SDXL LoRA Training - Single Mode") - print(f"{'=' * 70}") - print(f"📁 Folder: {folder_path}") - print(f"💾 Output: {output_name}.safetensors") - print(f"📋 Config: {config_file}") - print(f"🖥️ GPU: {args.gpu} ({vram_size}GB VRAM)") - print(f"⚡ Precision: {precision}") - print(f"{'-' * 70}") - print(f"📊 Training Parameters") - print(f"{'-' * 70}") - print(f" Images: {image_count}") - print(f" Repeats: {repeats}" + (" (manual)" if args.repeats else " (auto)")) - print(f" Epochs: {epochs}" + (" (manual)" if args.epochs else " (auto)")) - print(f" Batch size: {batch_size}" + (" (override)" if args.batch_size else "")) - print(f" Images/epoch: {image_count * repeats}") - print(f" Steps/epoch: {steps_per_epoch}") - print(f" Total steps: {total_steps}") - - # 오버라이드된 파라미터 표시 - overrides = [] - if args.lr: - print(f" Learning rate: {args.lr} (override)") - overrides.append(('lr', args.lr)) - if args.dim: - print(f" Network dim: {args.dim} (override)") - overrides.append(('dim', args.dim)) - if args.alpha: - print(f" Network alpha: {args.alpha} (override)") - overrides.append(('alpha', args.alpha)) - if args.resolution: - print(f" Resolution: {args.resolution} (override)") - overrides.append(('resolution', args.resolution)) - if args.optimizer: - print(f" Optimizer: {args.optimizer} (override)") - overrides.append(('optimizer', args.optimizer)) - if args.scheduler: - print(f" LR Scheduler: {args.scheduler} (override)") - overrides.append(('scheduler', args.scheduler)) - if args.save_every: - print(f" Save every: {args.save_every} epochs (override)") - overrides.append(('save_every', args.save_every)) - - 
print(f"{'=' * 70}\n") - - # 사용자 확인 - try: - response = input("학습을 시작하시겠습니까? (y/N): ") - if response.lower() not in ['y', 'yes']: - print("❌ 학습 취소됨") - sys.exit(0) - except KeyboardInterrupt: - print("\n❌ 학습 취소됨") - sys.exit(0) - - # accelerate 명령어 구성 - cmd = [ - "accelerate", "launch", - "--num_cpu_threads_per_process", "1", - "--mixed_precision", precision, - "sdxl_train_network.py", - f"--config_file={config_file}", - f"--train_data_dir={folder_path}", - f"--output_name={output_name}", - f"--max_train_epochs={epochs}", - f"--dataset_repeats={repeats}", - f"--mixed_precision={precision}" - ] - - # 오버라이드 추가 - if args.batch_size: - cmd.append(f"--train_batch_size={args.batch_size}") - if args.lr: - cmd.append(f"--learning_rate={args.lr}") - if args.dim: - cmd.append(f"--network_dim={args.dim}") - if args.alpha: - cmd.append(f"--network_alpha={args.alpha}") - if args.resolution: - cmd.append(f"--resolution={args.resolution}") - if args.optimizer: - cmd.append(f"--optimizer_type={args.optimizer}") - if args.scheduler: - cmd.append(f"--lr_scheduler={args.scheduler}") - if args.save_every: - cmd.append(f"--save_every_n_epochs={args.save_every}") - - # 환경 변수 설정 - env = os.environ.copy() - env['CUDA_VISIBLE_DEVICES'] = str(args.gpu) - - # 실행 - try: - print(f"\n🚀 Starting training...\n") - subprocess.run(cmd, env=env, check=True) - print(f"\n✅ 학습 완료: {output_name}.safetensors") - print(f"{'=' * 70}\n") - - except subprocess.CalledProcessError as e: - print(f"\n❌ 학습 실패: {e}") - sys.exit(1) - except KeyboardInterrupt: - print(f"\n⚠️ 학습 중단됨") - sys.exit(1) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/sd-scripts b/sd-scripts index b97fa5f..49115c2 160000 --- a/sd-scripts +++ b/sd-scripts @@ -1 +1 @@ -Subproject commit b97fa5fcb884e19ab50fb21b80ea65bf1e416b5c +Subproject commit 49115c25e9a0ffc6479b8f35d3ac76ba5352ca5b