Compare commits
125 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
443074eee9 | |
|
|
2e0503780d | |
|
|
00d2f4047d | |
|
|
c5d9edacd0 | |
|
|
47ccecaee0 | |
|
|
2327fa1c90 | |
|
|
084e08c6e2 | |
|
|
ef8f3cbcdc | |
|
|
6fbb6b6f49 | |
|
|
abf3d56f27 | |
|
|
2a14e1e96a | |
|
|
5edbdf4364 | |
|
|
3cdc0d523f | |
|
|
749d5b4e8d | |
|
|
e988df72f8 | |
|
|
0be87b082a | |
|
|
ec4b1659ab | |
|
|
cb388e2912 | |
|
|
9949c19c63 | |
|
|
cc6f9500a1 | |
|
|
db85cf03ff | |
|
|
91e1f45d80 | |
|
|
6045c11d8b | |
|
|
529c80255f | |
|
|
43a1263b60 | |
|
|
102773cd2c | |
|
|
1e1d4f1254 | |
|
|
eb22225387 | |
|
|
b38dd0ff23 | |
|
|
ad94d47221 | |
|
|
e75f775ae8 | |
|
|
c514890325 | |
|
|
543e9fba64 | |
|
|
fc5f4a996b | |
|
|
138571da95 | |
|
|
3d816db07f | |
|
|
b9dedea57d | |
|
|
3086026401 | |
|
|
9635c2ec9b | |
|
|
f8d92cf313 | |
|
|
4f48be4138 | |
|
|
541fd10bbe | |
|
|
05f7531148 | |
|
|
c033bbf516 | |
|
|
1391579c33 | |
|
|
d0c53c50c2 | |
|
|
b41ab53b6f | |
|
|
e9a2d1e4cc | |
|
|
1de83f91c3 | |
|
|
8f374716ee | |
|
|
cb0bbde402 | |
|
|
7ce3f64c78 | |
|
|
c5569e8627 | |
|
|
c16db7fd69 | |
|
|
fed4ac031a | |
|
|
35dfcbbb28 | |
|
|
722bc73319 | |
|
|
402ff1cdb7 | |
|
|
acd718598e | |
|
|
559501e4b8 | |
|
|
ee2db7488d | |
|
|
c2657d5fb9 | |
|
|
971932346a | |
|
|
31283d2892 | |
|
|
55ebd287ee | |
|
|
a2840e7552 | |
|
|
a134423890 | |
|
|
b920bdd77d | |
|
|
5410ed34f5 | |
|
|
e6be419a30 | |
|
|
3d4aca8084 | |
|
|
2d861fb146 | |
|
|
b615af1c65 | |
|
|
40862c0776 | |
|
|
50076f3439 | |
|
|
61c2387436 | |
|
|
7083484a48 | |
|
|
4b1444fc7a | |
|
|
8cbbea8f6a | |
|
|
13917b3880 | |
|
|
f21f6b2212 | |
|
|
eb0686bbb6 | |
|
|
5de94e70ec | |
|
|
76b75f3ad7 | |
|
|
0c63b4f6e3 | |
|
|
7d437687c2 | |
|
|
e2ddf28d78 | |
|
|
076639fed9 | |
|
|
55e6478526 | |
|
|
537c10d231 | |
|
|
8d723d2caa | |
|
|
d113d1cc32 | |
|
|
a500f1edac | |
|
|
3f77450ef1 | |
|
|
fc1fdf3389 | |
|
|
b353a7c863 | |
|
|
3696c5bad6 | |
|
|
3a56201da5 | |
|
|
6a2cdb817d | |
|
|
85b7495135 | |
|
|
225c52f6a4 | |
|
|
b1fdbeb9a7 | |
|
|
1dc64f3526 | |
|
|
359559c913 | |
|
|
8165485a17 | |
|
|
b0fd65e884 | |
|
|
2a1f402601 | |
|
|
3eba2dcf2d | |
|
|
404d7b9978 | |
|
|
6580a6bc01 | |
|
|
3b15651bc6 | |
|
|
a55835f10c | |
|
|
b53b10ea61 | |
|
|
7d5534d8e5 | |
|
|
5ebb0c2e0b | |
|
|
a0a64c679f | |
|
|
8e73678dae | |
|
|
c2862b24af | |
|
|
f9ec85f739 | |
|
|
2d5fd3f5dd | |
|
|
2d4970ff67 | |
|
|
e87858e974 | |
|
|
da6edb5a4e | |
|
|
6265a239f3 | |
|
|
d49420b3c7 |
|
|
@ -0,0 +1,2 @@
|
|||
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build
|
||||
pause
|
||||
|
|
@ -20,29 +20,12 @@ jobs:
|
|||
git_tag: ${{ inputs.git_tag }}
|
||||
cache_tag: "cu130"
|
||||
python_minor: "13"
|
||||
python_patch: "11"
|
||||
python_patch: "12"
|
||||
rel_name: "nvidia"
|
||||
rel_extra_name: ""
|
||||
test_release: true
|
||||
secrets: inherit
|
||||
|
||||
release_nvidia_cu128:
|
||||
permissions:
|
||||
contents: "write"
|
||||
packages: "write"
|
||||
pull-requests: "read"
|
||||
name: "Release NVIDIA cu128"
|
||||
uses: ./.github/workflows/stable-release.yml
|
||||
with:
|
||||
git_tag: ${{ inputs.git_tag }}
|
||||
cache_tag: "cu128"
|
||||
python_minor: "12"
|
||||
python_patch: "10"
|
||||
rel_name: "nvidia"
|
||||
rel_extra_name: "_cu128"
|
||||
test_release: true
|
||||
secrets: inherit
|
||||
|
||||
release_nvidia_cu126:
|
||||
permissions:
|
||||
contents: "write"
|
||||
|
|
@ -76,3 +59,20 @@ jobs:
|
|||
rel_extra_name: ""
|
||||
test_release: false
|
||||
secrets: inherit
|
||||
|
||||
release_xpu:
|
||||
permissions:
|
||||
contents: "write"
|
||||
packages: "write"
|
||||
pull-requests: "read"
|
||||
name: "Release Intel XPU"
|
||||
uses: ./.github/workflows/stable-release.yml
|
||||
with:
|
||||
git_tag: ${{ inputs.git_tag }}
|
||||
cache_tag: "xpu"
|
||||
python_minor: "13"
|
||||
python_patch: "12"
|
||||
rel_name: "intel"
|
||||
rel_extra_name: ""
|
||||
test_release: true
|
||||
secrets: inherit
|
||||
|
|
|
|||
|
|
@ -21,6 +21,5 @@ venv*/
|
|||
*.log
|
||||
web_custom_versions/
|
||||
.DS_Store
|
||||
openapi.yaml
|
||||
filtered-openapi.yaml
|
||||
uv.lock
|
||||
|
|
|
|||
|
|
@ -139,9 +139,9 @@ Example:
|
|||
"_quantization_metadata": {
|
||||
"format_version": "1.0",
|
||||
"layers": {
|
||||
"model.layers.0.mlp.up_proj": "float8_e4m3fn",
|
||||
"model.layers.0.mlp.down_proj": "float8_e4m3fn",
|
||||
"model.layers.1.mlp.up_proj": "float8_e4m3fn"
|
||||
"model.layers.0.mlp.up_proj": {"format": "float8_e4m3fn"},
|
||||
"model.layers.0.mlp.down_proj": {"format": "float8_e4m3fn"},
|
||||
"model.layers.1.mlp.up_proj": {"format": "float8_e4m3fn"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -165,4 +165,4 @@ Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_s
|
|||
3. **Compute scales**: Derive `input_scale` from collected statistics
|
||||
4. **Store in checkpoint**: Save `input_scale` parameters alongside weights
|
||||
|
||||
The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
|
||||
The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
|
||||
|
|
|
|||
11
README.md
11
README.md
|
|
@ -61,6 +61,7 @@ See what ComfyUI can do with the [newer template workflows](https://comfy.org/wo
|
|||
|
||||
## Features
|
||||
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
|
||||
- NOTE: There are many more models supported than the list below, if you want to see what is supported see our templates list inside ComfyUI.
|
||||
- Image Models
|
||||
- SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/))
|
||||
- [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
|
||||
|
|
@ -136,7 +137,7 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang
|
|||
- Builds a new release using the latest stable core version
|
||||
|
||||
3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
|
||||
- Weekly frontend updates are merged into the core repository
|
||||
- Every 2+ weeks frontend updates are merged into the core repository
|
||||
- Features are frozen for the upcoming core release
|
||||
- Development continues for the next release cycle
|
||||
|
||||
|
|
@ -194,7 +195,9 @@ The portable above currently comes with python 3.13 and pytorch cuda 13.0. Updat
|
|||
|
||||
#### Alternative Downloads:
|
||||
|
||||
[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
|
||||
[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
|
||||
|
||||
[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
|
||||
|
||||
[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
|
||||
|
||||
|
|
@ -232,7 +235,7 @@ Put your VAE in: models/vae
|
|||
|
||||
AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
|
||||
|
||||
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.1```
|
||||
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2```
|
||||
|
||||
This is the command to install the nightly with ROCm 7.2 which might have some performance improvements:
|
||||
|
||||
|
|
@ -275,7 +278,7 @@ Nvidia users should install stable pytorch using this command:
|
|||
|
||||
This is the command to install pytorch nightly instead which might have performance improvements.
|
||||
|
||||
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130```
|
||||
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu132```
|
||||
|
||||
#### Troubleshooting
|
||||
|
||||
|
|
|
|||
|
|
@ -67,7 +67,7 @@ class InternalRoutes:
|
|||
(entry for entry in os.scandir(directory) if is_visible_file(entry)),
|
||||
key=lambda entry: -entry.stat().st_mtime
|
||||
)
|
||||
return web.json_response([entry.name for entry in sorted_files], status=200)
|
||||
return web.json_response([f"{entry.name} [{directory_type}]" for entry in sorted_files], status=200)
|
||||
|
||||
|
||||
def get_app(self):
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from app.assets.database.queries.asset import (
|
||||
asset_exists_by_hash,
|
||||
bulk_insert_assets,
|
||||
create_stub_asset,
|
||||
get_asset_by_hash,
|
||||
get_existing_asset_ids,
|
||||
reassign_asset_references,
|
||||
|
|
@ -12,6 +13,7 @@ from app.assets.database.queries.asset_reference import (
|
|||
UnenrichedReferenceRow,
|
||||
bulk_insert_references_ignore_conflicts,
|
||||
bulk_update_enrichment_level,
|
||||
count_active_siblings,
|
||||
bulk_update_is_missing,
|
||||
bulk_update_needs_verify,
|
||||
convert_metadata_to_rows,
|
||||
|
|
@ -80,6 +82,8 @@ __all__ = [
|
|||
"bulk_insert_references_ignore_conflicts",
|
||||
"bulk_insert_tags_and_meta",
|
||||
"bulk_update_enrichment_level",
|
||||
"count_active_siblings",
|
||||
"create_stub_asset",
|
||||
"bulk_update_is_missing",
|
||||
"bulk_update_needs_verify",
|
||||
"convert_metadata_to_rows",
|
||||
|
|
|
|||
|
|
@ -78,6 +78,18 @@ def upsert_asset(
|
|||
return asset, created, updated
|
||||
|
||||
|
||||
def create_stub_asset(
|
||||
session: Session,
|
||||
size_bytes: int,
|
||||
mime_type: str | None = None,
|
||||
) -> Asset:
|
||||
"""Create a new asset with no hash (stub for later enrichment)."""
|
||||
asset = Asset(size_bytes=size_bytes, mime_type=mime_type, hash=None)
|
||||
session.add(asset)
|
||||
session.flush()
|
||||
return asset
|
||||
|
||||
|
||||
def bulk_insert_assets(
|
||||
session: Session,
|
||||
rows: list[dict],
|
||||
|
|
|
|||
|
|
@ -114,6 +114,23 @@ def get_reference_by_file_path(
|
|||
)
|
||||
|
||||
|
||||
def count_active_siblings(
|
||||
session: Session,
|
||||
asset_id: str,
|
||||
exclude_reference_id: str,
|
||||
) -> int:
|
||||
"""Count active (non-deleted) references to an asset, excluding one reference."""
|
||||
return (
|
||||
session.query(AssetReference)
|
||||
.filter(
|
||||
AssetReference.asset_id == asset_id,
|
||||
AssetReference.id != exclude_reference_id,
|
||||
AssetReference.deleted_at.is_(None),
|
||||
)
|
||||
.count()
|
||||
)
|
||||
|
||||
|
||||
def reference_exists_for_asset_id(
|
||||
session: Session,
|
||||
asset_id: str,
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from app.assets.database.queries import (
|
|||
delete_references_by_ids,
|
||||
ensure_tags_exist,
|
||||
get_asset_by_hash,
|
||||
get_reference_by_id,
|
||||
get_references_for_prefixes,
|
||||
get_unenriched_references,
|
||||
mark_references_missing_outside_prefixes,
|
||||
|
|
@ -338,6 +339,7 @@ def build_asset_specs(
|
|||
"metadata": metadata,
|
||||
"hash": asset_hash,
|
||||
"mime_type": mime_type,
|
||||
"job_id": None,
|
||||
}
|
||||
)
|
||||
tag_pool.update(tags)
|
||||
|
|
@ -426,6 +428,7 @@ def enrich_asset(
|
|||
except OSError:
|
||||
return new_level
|
||||
|
||||
initial_mtime_ns = get_mtime_ns(stat_p)
|
||||
rel_fname = compute_relative_filename(file_path)
|
||||
mime_type: str | None = None
|
||||
metadata = None
|
||||
|
|
@ -489,6 +492,18 @@ def enrich_asset(
|
|||
except Exception as e:
|
||||
logging.warning("Failed to hash %s: %s", file_path, e)
|
||||
|
||||
# Optimistic guard: if the reference's mtime_ns changed since we
|
||||
# started (e.g. ingest_existing_file updated it), our results are
|
||||
# stale — discard them to avoid overwriting fresh registration data.
|
||||
ref = get_reference_by_id(session, reference_id)
|
||||
if ref is None or ref.mtime_ns != initial_mtime_ns:
|
||||
session.rollback()
|
||||
logging.info(
|
||||
"Ref %s mtime changed during enrichment, discarding stale result",
|
||||
reference_id,
|
||||
)
|
||||
return ENRICHMENT_STUB
|
||||
|
||||
if extract_metadata and metadata:
|
||||
system_metadata = metadata.to_user_metadata()
|
||||
set_reference_system_metadata(session, reference_id, system_metadata)
|
||||
|
|
|
|||
|
|
@ -77,7 +77,9 @@ class _AssetSeeder:
|
|||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._lock = threading.Lock()
|
||||
# RLock is required because _run_scan() drains pending work while
|
||||
# holding _lock and re-enters start() which also acquires _lock.
|
||||
self._lock = threading.RLock()
|
||||
self._state = State.IDLE
|
||||
self._progress: Progress | None = None
|
||||
self._last_progress: Progress | None = None
|
||||
|
|
@ -92,6 +94,7 @@ class _AssetSeeder:
|
|||
self._prune_first: bool = False
|
||||
self._progress_callback: ProgressCallback | None = None
|
||||
self._disabled: bool = False
|
||||
self._pending_enrich: dict | None = None
|
||||
|
||||
def disable(self) -> None:
|
||||
"""Disable the asset seeder, preventing any scans from starting."""
|
||||
|
|
@ -196,6 +199,42 @@ class _AssetSeeder:
|
|||
compute_hashes=compute_hashes,
|
||||
)
|
||||
|
||||
def enqueue_enrich(
|
||||
self,
|
||||
roots: tuple[RootType, ...] = ("models", "input", "output"),
|
||||
compute_hashes: bool = False,
|
||||
) -> bool:
|
||||
"""Start an enrichment scan now, or queue it for after the current scan.
|
||||
|
||||
If the seeder is idle, starts immediately. Otherwise, the enrich
|
||||
request is stored and will run automatically when the current scan
|
||||
finishes.
|
||||
|
||||
Args:
|
||||
roots: Tuple of root types to scan
|
||||
compute_hashes: If True, compute blake3 hashes
|
||||
|
||||
Returns:
|
||||
True if started immediately, False if queued for later
|
||||
"""
|
||||
with self._lock:
|
||||
if self.start_enrich(roots=roots, compute_hashes=compute_hashes):
|
||||
return True
|
||||
if self._pending_enrich is not None:
|
||||
existing_roots = set(self._pending_enrich["roots"])
|
||||
existing_roots.update(roots)
|
||||
self._pending_enrich["roots"] = tuple(existing_roots)
|
||||
self._pending_enrich["compute_hashes"] = (
|
||||
self._pending_enrich["compute_hashes"] or compute_hashes
|
||||
)
|
||||
else:
|
||||
self._pending_enrich = {
|
||||
"roots": roots,
|
||||
"compute_hashes": compute_hashes,
|
||||
}
|
||||
logging.info("Enrich scan queued (roots=%s)", self._pending_enrich["roots"])
|
||||
return False
|
||||
|
||||
def cancel(self) -> bool:
|
||||
"""Request cancellation of the current scan.
|
||||
|
||||
|
|
@ -381,9 +420,13 @@ class _AssetSeeder:
|
|||
return marked
|
||||
finally:
|
||||
with self._lock:
|
||||
self._last_progress = self._progress
|
||||
self._state = State.IDLE
|
||||
self._progress = None
|
||||
self._reset_to_idle()
|
||||
|
||||
def _reset_to_idle(self) -> None:
|
||||
"""Reset state to IDLE, preserving last progress. Caller must hold _lock."""
|
||||
self._last_progress = self._progress
|
||||
self._state = State.IDLE
|
||||
self._progress = None
|
||||
|
||||
def _is_cancelled(self) -> bool:
|
||||
"""Check if cancellation has been requested."""
|
||||
|
|
@ -594,9 +637,18 @@ class _AssetSeeder:
|
|||
},
|
||||
)
|
||||
with self._lock:
|
||||
self._last_progress = self._progress
|
||||
self._state = State.IDLE
|
||||
self._progress = None
|
||||
self._reset_to_idle()
|
||||
pending = self._pending_enrich
|
||||
if pending is not None:
|
||||
self._pending_enrich = None
|
||||
if not self.start_enrich(
|
||||
roots=pending["roots"],
|
||||
compute_hashes=pending["compute_hashes"],
|
||||
):
|
||||
logging.warning(
|
||||
"Pending enrich scan could not start (roots=%s)",
|
||||
pending["roots"],
|
||||
)
|
||||
|
||||
def _run_fast_phase(self, roots: tuple[RootType, ...]) -> tuple[int, int, int]:
|
||||
"""Run phase 1: fast scan to create stub records.
|
||||
|
|
|
|||
|
|
@ -23,6 +23,8 @@ from app.assets.services.ingest import (
|
|||
DependencyMissingError,
|
||||
HashMismatchError,
|
||||
create_from_hash,
|
||||
ingest_existing_file,
|
||||
register_output_files,
|
||||
upload_from_temp_path,
|
||||
)
|
||||
from app.assets.database.queries import (
|
||||
|
|
@ -72,6 +74,8 @@ __all__ = [
|
|||
"delete_asset_reference",
|
||||
"get_asset_by_hash",
|
||||
"get_asset_detail",
|
||||
"ingest_existing_file",
|
||||
"register_output_files",
|
||||
"get_mtime_ns",
|
||||
"get_size_and_mtime_ns",
|
||||
"list_assets_page",
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ class SeedAssetSpec(TypedDict):
|
|||
metadata: ExtractedMetadata | None
|
||||
hash: str | None
|
||||
mime_type: str | None
|
||||
job_id: str | None
|
||||
|
||||
|
||||
class AssetRow(TypedDict):
|
||||
|
|
@ -60,6 +61,7 @@ class ReferenceRow(TypedDict):
|
|||
name: str
|
||||
preview_id: str | None
|
||||
user_metadata: dict[str, Any] | None
|
||||
job_id: str | None
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
last_access_time: datetime
|
||||
|
|
@ -167,6 +169,7 @@ def batch_insert_seed_assets(
|
|||
"name": spec["info_name"],
|
||||
"preview_id": None,
|
||||
"user_metadata": user_metadata,
|
||||
"job_id": spec.get("job_id"),
|
||||
"created_at": current_time,
|
||||
"updated_at": current_time,
|
||||
"last_access_time": current_time,
|
||||
|
|
|
|||
|
|
@ -9,6 +9,9 @@ from sqlalchemy.orm import Session
|
|||
import app.assets.services.hashing as hashing
|
||||
from app.assets.database.queries import (
|
||||
add_tags_to_reference,
|
||||
count_active_siblings,
|
||||
create_stub_asset,
|
||||
ensure_tags_exist,
|
||||
fetch_reference_and_asset,
|
||||
get_asset_by_hash,
|
||||
get_reference_by_file_path,
|
||||
|
|
@ -23,7 +26,8 @@ from app.assets.database.queries import (
|
|||
upsert_reference,
|
||||
validate_tags_exist,
|
||||
)
|
||||
from app.assets.helpers import normalize_tags
|
||||
from app.assets.helpers import get_utc_now, normalize_tags
|
||||
from app.assets.services.bulk_ingest import batch_insert_seed_assets
|
||||
from app.assets.services.file_utils import get_size_and_mtime_ns
|
||||
from app.assets.services.path_utils import (
|
||||
compute_relative_filename,
|
||||
|
|
@ -130,6 +134,102 @@ def _ingest_file_from_path(
|
|||
)
|
||||
|
||||
|
||||
def register_output_files(
|
||||
file_paths: Sequence[str],
|
||||
user_metadata: UserMetadata = None,
|
||||
job_id: str | None = None,
|
||||
) -> int:
|
||||
"""Register a batch of output file paths as assets.
|
||||
|
||||
Returns the number of files successfully registered.
|
||||
"""
|
||||
registered = 0
|
||||
for abs_path in file_paths:
|
||||
if not os.path.isfile(abs_path):
|
||||
continue
|
||||
try:
|
||||
if ingest_existing_file(
|
||||
abs_path, user_metadata=user_metadata, job_id=job_id
|
||||
):
|
||||
registered += 1
|
||||
except Exception:
|
||||
logging.exception("Failed to register output: %s", abs_path)
|
||||
return registered
|
||||
|
||||
|
||||
def ingest_existing_file(
|
||||
abs_path: str,
|
||||
user_metadata: UserMetadata = None,
|
||||
extra_tags: Sequence[str] = (),
|
||||
owner_id: str = "",
|
||||
job_id: str | None = None,
|
||||
) -> bool:
|
||||
"""Register an existing on-disk file as an asset stub.
|
||||
|
||||
If a reference already exists for this path, updates mtime_ns, job_id,
|
||||
size_bytes, and resets enrichment so the enricher will re-hash it.
|
||||
|
||||
For brand-new paths, inserts a stub record (hash=NULL) for immediate
|
||||
UX visibility.
|
||||
|
||||
Returns True if a row was inserted or updated, False otherwise.
|
||||
"""
|
||||
locator = os.path.abspath(abs_path)
|
||||
size_bytes, mtime_ns = get_size_and_mtime_ns(abs_path)
|
||||
mime_type = mimetypes.guess_type(abs_path, strict=False)[0]
|
||||
name, path_tags = get_name_and_tags_from_asset_path(abs_path)
|
||||
tags = list(dict.fromkeys(path_tags + list(extra_tags)))
|
||||
|
||||
with create_session() as session:
|
||||
existing_ref = get_reference_by_file_path(session, locator)
|
||||
if existing_ref is not None:
|
||||
now = get_utc_now()
|
||||
existing_ref.mtime_ns = mtime_ns
|
||||
existing_ref.job_id = job_id
|
||||
existing_ref.is_missing = False
|
||||
existing_ref.deleted_at = None
|
||||
existing_ref.updated_at = now
|
||||
existing_ref.enrichment_level = 0
|
||||
|
||||
asset = existing_ref.asset
|
||||
if asset:
|
||||
# If other refs share this asset, detach to a new stub
|
||||
# instead of mutating the shared row.
|
||||
siblings = count_active_siblings(session, asset.id, existing_ref.id)
|
||||
if siblings > 0:
|
||||
new_asset = create_stub_asset(
|
||||
session,
|
||||
size_bytes=size_bytes,
|
||||
mime_type=mime_type or asset.mime_type,
|
||||
)
|
||||
existing_ref.asset_id = new_asset.id
|
||||
else:
|
||||
asset.hash = None
|
||||
asset.size_bytes = size_bytes
|
||||
if mime_type:
|
||||
asset.mime_type = mime_type
|
||||
session.commit()
|
||||
return True
|
||||
|
||||
spec = {
|
||||
"abs_path": abs_path,
|
||||
"size_bytes": size_bytes,
|
||||
"mtime_ns": mtime_ns,
|
||||
"info_name": name,
|
||||
"tags": tags,
|
||||
"fname": os.path.basename(abs_path),
|
||||
"metadata": None,
|
||||
"hash": None,
|
||||
"mime_type": mime_type,
|
||||
"job_id": job_id,
|
||||
}
|
||||
if tags:
|
||||
ensure_tags_exist(session, tags)
|
||||
result = batch_insert_seed_assets(session, [spec], owner_id=owner_id)
|
||||
session.commit()
|
||||
return result.won_paths > 0
|
||||
|
||||
|
||||
def _register_existing_asset(
|
||||
asset_hash: str,
|
||||
name: str,
|
||||
|
|
|
|||
|
|
@ -93,12 +93,13 @@ def compute_relative_filename(file_path: str) -> str | None:
|
|||
|
||||
def get_asset_category_and_relative_path(
|
||||
file_path: str,
|
||||
) -> tuple[Literal["input", "output", "models"], str]:
|
||||
) -> tuple[Literal["input", "output", "temp", "models"], str]:
|
||||
"""Determine which root category a file path belongs to.
|
||||
|
||||
Categories:
|
||||
- 'input': under folder_paths.get_input_directory()
|
||||
- 'output': under folder_paths.get_output_directory()
|
||||
- 'temp': under folder_paths.get_temp_directory()
|
||||
- 'models': under any base path from get_comfy_models_folders()
|
||||
|
||||
Returns:
|
||||
|
|
@ -129,7 +130,12 @@ def get_asset_category_and_relative_path(
|
|||
if _check_is_within(fp_abs, output_base):
|
||||
return "output", _compute_relative(fp_abs, output_base)
|
||||
|
||||
# 3) models (check deepest matching base to avoid ambiguity)
|
||||
# 3) temp
|
||||
temp_base = os.path.abspath(folder_paths.get_temp_directory())
|
||||
if _check_is_within(fp_abs, temp_base):
|
||||
return "temp", _compute_relative(fp_abs, temp_base)
|
||||
|
||||
# 4) models (check deepest matching base to avoid ambiguity)
|
||||
best: tuple[int, str, str] | None = None # (base_len, bucket, rel_inside_bucket)
|
||||
for bucket, bases in get_comfy_models_folders():
|
||||
for b in bases:
|
||||
|
|
@ -146,7 +152,7 @@ def get_asset_category_and_relative_path(
|
|||
return "models", os.path.relpath(os.path.join(os.sep, combined), os.sep)
|
||||
|
||||
raise ValueError(
|
||||
f"Path is not within input, output, or configured model bases: {file_path}"
|
||||
f"Path is not within input, output, temp, or configured model bases: {file_path}"
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,90 @@
|
|||
#version 300 es
|
||||
precision highp float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform float u_float0;
|
||||
uniform float u_float1;
|
||||
uniform float u_float2;
|
||||
uniform float u_float3;
|
||||
uniform float u_float4;
|
||||
uniform float u_float5;
|
||||
uniform float u_float6;
|
||||
uniform float u_float7;
|
||||
uniform float u_float8;
|
||||
uniform bool u_bool0;
|
||||
|
||||
in vec2 v_texCoord;
|
||||
out vec4 fragColor;
|
||||
|
||||
vec3 rgb2hsl(vec3 c) {
|
||||
float maxC = max(c.r, max(c.g, c.b));
|
||||
float minC = min(c.r, min(c.g, c.b));
|
||||
float l = (maxC + minC) * 0.5;
|
||||
if (maxC == minC) return vec3(0.0, 0.0, l);
|
||||
float d = maxC - minC;
|
||||
float s = l > 0.5 ? d / (2.0 - maxC - minC) : d / (maxC + minC);
|
||||
float h;
|
||||
if (maxC == c.r) {
|
||||
h = (c.g - c.b) / d + (c.g < c.b ? 6.0 : 0.0);
|
||||
} else if (maxC == c.g) {
|
||||
h = (c.b - c.r) / d + 2.0;
|
||||
} else {
|
||||
h = (c.r - c.g) / d + 4.0;
|
||||
}
|
||||
h /= 6.0;
|
||||
return vec3(h, s, l);
|
||||
}
|
||||
|
||||
float hue2rgb(float p, float q, float t) {
|
||||
if (t < 0.0) t += 1.0;
|
||||
if (t > 1.0) t -= 1.0;
|
||||
if (t < 1.0 / 6.0) return p + (q - p) * 6.0 * t;
|
||||
if (t < 1.0 / 2.0) return q;
|
||||
if (t < 2.0 / 3.0) return p + (q - p) * (2.0 / 3.0 - t) * 6.0;
|
||||
return p;
|
||||
}
|
||||
|
||||
vec3 hsl2rgb(vec3 hsl) {
|
||||
float h = hsl.x, s = hsl.y, l = hsl.z;
|
||||
if (s == 0.0) return vec3(l);
|
||||
float q = l < 0.5 ? l * (1.0 + s) : l + s - l * s;
|
||||
float p = 2.0 * l - q;
|
||||
return vec3(
|
||||
hue2rgb(p, q, h + 1.0 / 3.0),
|
||||
hue2rgb(p, q, h),
|
||||
hue2rgb(p, q, h - 1.0 / 3.0)
|
||||
);
|
||||
}
|
||||
|
||||
void main() {
|
||||
vec4 tex = texture(u_image0, v_texCoord);
|
||||
vec3 color = tex.rgb;
|
||||
|
||||
vec3 shadows = vec3(u_float0, u_float1, u_float2) * 0.01;
|
||||
vec3 midtones = vec3(u_float3, u_float4, u_float5) * 0.01;
|
||||
vec3 highlights = vec3(u_float6, u_float7, u_float8) * 0.01;
|
||||
|
||||
float maxC = max(color.r, max(color.g, color.b));
|
||||
float minC = min(color.r, min(color.g, color.b));
|
||||
float lightness = (maxC + minC) * 0.5;
|
||||
|
||||
// GIMP weight curves: linear ramps with constants a=0.25, b=0.333, scale=0.7
|
||||
const float a = 0.25;
|
||||
const float b = 0.333;
|
||||
const float scale = 0.7;
|
||||
|
||||
float sw = clamp((lightness - b) / -a + 0.5, 0.0, 1.0) * scale;
|
||||
float mw = clamp((lightness - b) / a + 0.5, 0.0, 1.0) *
|
||||
clamp((lightness + b - 1.0) / -a + 0.5, 0.0, 1.0) * scale;
|
||||
float hw = clamp((lightness + b - 1.0) / a + 0.5, 0.0, 1.0) * scale;
|
||||
|
||||
color += sw * shadows + mw * midtones + hw * highlights;
|
||||
|
||||
if (u_bool0) {
|
||||
vec3 hsl = rgb2hsl(clamp(color, 0.0, 1.0));
|
||||
hsl.z = lightness;
|
||||
color = hsl2rgb(hsl);
|
||||
}
|
||||
|
||||
fragColor = vec4(clamp(color, 0.0, 1.0), tex.a);
|
||||
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
#version 300 es
|
||||
precision highp float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform sampler2D u_curve0; // RGB master curve (256x1 LUT)
|
||||
uniform sampler2D u_curve1; // Red channel curve
|
||||
uniform sampler2D u_curve2; // Green channel curve
|
||||
uniform sampler2D u_curve3; // Blue channel curve
|
||||
|
||||
in vec2 v_texCoord;
|
||||
layout(location = 0) out vec4 fragColor0;
|
||||
|
||||
// GIMP-compatible curve lookup with manual linear interpolation.
|
||||
// Matches gimp_curve_map_value_inline() from gimpcurve-map.c:
|
||||
// index = value * (n_samples - 1)
|
||||
// f = fract(index)
|
||||
// result = (1-f) * samples[floor] + f * samples[ceil]
|
||||
//
|
||||
// Uses texelFetch (NEAREST) to avoid GPU half-texel offset issues
|
||||
// that occur with texture() + GL_LINEAR on small 256x1 LUTs.
|
||||
float applyCurve(sampler2D curve, float value) {
|
||||
value = clamp(value, 0.0, 1.0);
|
||||
|
||||
float pos = value * 255.0;
|
||||
int lo = int(floor(pos));
|
||||
int hi = min(lo + 1, 255);
|
||||
float f = pos - float(lo);
|
||||
|
||||
float a = texelFetch(curve, ivec2(lo, 0), 0).r;
|
||||
float b = texelFetch(curve, ivec2(hi, 0), 0).r;
|
||||
|
||||
return a + f * (b - a);
|
||||
}
|
||||
|
||||
void main() {
|
||||
vec4 color = texture(u_image0, v_texCoord);
|
||||
|
||||
// GIMP order: per-channel curves first, then RGB master curve.
|
||||
// See gimp_curve_map_pixels() default case in gimpcurve-map.c:
|
||||
// dest = colors_curve( channel_curve( src ) )
|
||||
float tmp_r = applyCurve(u_curve1, color.r);
|
||||
float tmp_g = applyCurve(u_curve2, color.g);
|
||||
float tmp_b = applyCurve(u_curve3, color.b);
|
||||
color.r = applyCurve(u_curve0, tmp_r);
|
||||
color.g = applyCurve(u_curve0, tmp_g);
|
||||
color.b = applyCurve(u_curve0, tmp_b);
|
||||
|
||||
fragColor0 = vec4(color.rgb, color.a);
|
||||
}
|
||||
|
|
@ -2,7 +2,6 @@
|
|||
precision mediump float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform int u_int0; // Blend mode
|
||||
uniform int u_int1; // Color tint
|
||||
uniform float u_float0; // Intensity
|
||||
|
|
@ -75,7 +74,7 @@ void main() {
|
|||
float t0 = threshold - 0.15;
|
||||
float t1 = threshold + 0.15;
|
||||
|
||||
vec2 texelSize = 1.0 / u_resolution;
|
||||
vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
float radius2 = radius * radius;
|
||||
|
||||
float sampleScale = clamp(radius * 0.75, 0.35, 1.0);
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ const int RADIAL_SAMPLES = 12;
|
|||
const float RADIAL_STRENGTH = 0.0003;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform int u_int0; // Blur type (BLUR_GAUSSIAN, BLUR_BOX, BLUR_RADIAL)
|
||||
uniform float u_float0; // Blur radius/amount
|
||||
uniform int u_pass; // Pass index (0 = horizontal, 1 = vertical)
|
||||
|
|
@ -25,7 +24,7 @@ float gaussian(float x, float sigma) {
|
|||
}
|
||||
|
||||
void main() {
|
||||
vec2 texelSize = 1.0 / u_resolution;
|
||||
vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
float radius = max(u_float0, 0.0);
|
||||
|
||||
// Radial (angular) blur - single pass, doesn't use separable
|
||||
|
|
|
|||
|
|
@ -2,14 +2,13 @@
|
|||
precision highp float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0
|
||||
|
||||
in vec2 v_texCoord;
|
||||
layout(location = 0) out vec4 fragColor0;
|
||||
|
||||
void main() {
|
||||
vec2 texel = 1.0 / u_resolution;
|
||||
vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
|
||||
// Sample center and neighbors
|
||||
vec4 center = texture(u_image0, v_texCoord);
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
precision highp float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform float u_float0; // amount [0.0 - 3.0] typical: 0.5-1.5
|
||||
uniform float u_float1; // radius [0.5 - 10.0] blur radius in pixels
|
||||
uniform float u_float2; // threshold [0.0 - 0.1] min difference to sharpen
|
||||
|
|
@ -19,7 +18,7 @@ float getLuminance(vec3 color) {
|
|||
}
|
||||
|
||||
void main() {
|
||||
vec2 texel = 1.0 / u_resolution;
|
||||
vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
float radius = max(u_float1, 0.5);
|
||||
float amount = u_float0;
|
||||
float threshold = u_float2;
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,615 @@
|
|||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 10,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 10,
|
||||
"type": "d5c462c8-1372-4af8-84f2-547c83470d04",
|
||||
"pos": [
|
||||
3610,
|
||||
-2630
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
420
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "IMAGE",
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"4",
|
||||
"curve"
|
||||
],
|
||||
[
|
||||
"5",
|
||||
"curve"
|
||||
],
|
||||
[
|
||||
"6",
|
||||
"curve"
|
||||
],
|
||||
[
|
||||
"7",
|
||||
"curve"
|
||||
]
|
||||
]
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Color Curves"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "d5c462c8-1372-4af8-84f2-547c83470d04",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 9,
|
||||
"lastLinkId": 38,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Color Curves",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
2660,
|
||||
-4500,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
4270,
|
||||
-4500,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "abc345b7-f55e-4f32-a11d-3aa4c2b0936b",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
29,
|
||||
34
|
||||
],
|
||||
"localized_name": "images.image0",
|
||||
"label": "image",
|
||||
"pos": [
|
||||
2760,
|
||||
-4480
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "eb0ec079-46da-4408-8263-9ef85569d33d",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
28
|
||||
],
|
||||
"localized_name": "IMAGE0",
|
||||
"label": "IMAGE",
|
||||
"pos": [
|
||||
4290,
|
||||
-4480
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 4,
|
||||
"type": "CurveEditor",
|
||||
"pos": [
|
||||
3060,
|
||||
-4500
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "curve",
|
||||
"localized_name": "curve",
|
||||
"name": "curve",
|
||||
"type": "CURVE",
|
||||
"widget": {
|
||||
"name": "curve"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "histogram",
|
||||
"localized_name": "histogram",
|
||||
"name": "histogram",
|
||||
"type": "HISTOGRAM",
|
||||
"shape": 7,
|
||||
"link": 35
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CURVE",
|
||||
"name": "CURVE",
|
||||
"type": "CURVE",
|
||||
"links": [
|
||||
30
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "RGB Master",
|
||||
"properties": {
|
||||
"Node name for S&R": "CurveEditor"
|
||||
},
|
||||
"widgets_values": []
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "CurveEditor",
|
||||
"pos": [
|
||||
3060,
|
||||
-4250
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "curve",
|
||||
"localized_name": "curve",
|
||||
"name": "curve",
|
||||
"type": "CURVE",
|
||||
"widget": {
|
||||
"name": "curve"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "histogram",
|
||||
"localized_name": "histogram",
|
||||
"name": "histogram",
|
||||
"type": "HISTOGRAM",
|
||||
"shape": 7,
|
||||
"link": 36
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CURVE",
|
||||
"name": "CURVE",
|
||||
"type": "CURVE",
|
||||
"links": [
|
||||
31
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Red",
|
||||
"properties": {
|
||||
"Node name for S&R": "CurveEditor"
|
||||
},
|
||||
"widgets_values": []
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "CurveEditor",
|
||||
"pos": [
|
||||
3060,
|
||||
-4000
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "curve",
|
||||
"localized_name": "curve",
|
||||
"name": "curve",
|
||||
"type": "CURVE",
|
||||
"widget": {
|
||||
"name": "curve"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "histogram",
|
||||
"localized_name": "histogram",
|
||||
"name": "histogram",
|
||||
"type": "HISTOGRAM",
|
||||
"shape": 7,
|
||||
"link": 37
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CURVE",
|
||||
"name": "CURVE",
|
||||
"type": "CURVE",
|
||||
"links": [
|
||||
32
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Green",
|
||||
"properties": {
|
||||
"Node name for S&R": "CurveEditor"
|
||||
},
|
||||
"widgets_values": []
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "CurveEditor",
|
||||
"pos": [
|
||||
3060,
|
||||
-3750
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "curve",
|
||||
"localized_name": "curve",
|
||||
"name": "curve",
|
||||
"type": "CURVE",
|
||||
"widget": {
|
||||
"name": "curve"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "histogram",
|
||||
"localized_name": "histogram",
|
||||
"name": "histogram",
|
||||
"type": "HISTOGRAM",
|
||||
"shape": 7,
|
||||
"link": 38
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CURVE",
|
||||
"name": "CURVE",
|
||||
"type": "CURVE",
|
||||
"links": [
|
||||
33
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Blue",
|
||||
"properties": {
|
||||
"Node name for S&R": "CurveEditor"
|
||||
},
|
||||
"widgets_values": []
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "GLSLShader",
|
||||
"pos": [
|
||||
3590,
|
||||
-4500
|
||||
],
|
||||
"size": [
|
||||
420,
|
||||
500
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image0",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": 29
|
||||
},
|
||||
{
|
||||
"label": "image1",
|
||||
"localized_name": "images.image1",
|
||||
"name": "images.image1",
|
||||
"shape": 7,
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "u_curve0",
|
||||
"localized_name": "curves.u_curve0",
|
||||
"name": "curves.u_curve0",
|
||||
"shape": 7,
|
||||
"type": "CURVE",
|
||||
"link": 30
|
||||
},
|
||||
{
|
||||
"label": "u_curve1",
|
||||
"localized_name": "curves.u_curve1",
|
||||
"name": "curves.u_curve1",
|
||||
"shape": 7,
|
||||
"type": "CURVE",
|
||||
"link": 31
|
||||
},
|
||||
{
|
||||
"label": "u_curve2",
|
||||
"localized_name": "curves.u_curve2",
|
||||
"name": "curves.u_curve2",
|
||||
"shape": 7,
|
||||
"type": "CURVE",
|
||||
"link": 32
|
||||
},
|
||||
{
|
||||
"label": "u_curve3",
|
||||
"localized_name": "curves.u_curve3",
|
||||
"name": "curves.u_curve3",
|
||||
"shape": 7,
|
||||
"type": "CURVE",
|
||||
"link": 33
|
||||
},
|
||||
{
|
||||
"localized_name": "fragment_shader",
|
||||
"name": "fragment_shader",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "fragment_shader"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "size_mode",
|
||||
"name": "size_mode",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "size_mode"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
28
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE1",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE2",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE3",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform sampler2D u_curve0; // RGB master curve (256x1 LUT)\nuniform sampler2D u_curve1; // Red channel curve\nuniform sampler2D u_curve2; // Green channel curve\nuniform sampler2D u_curve3; // Blue channel curve\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\n// GIMP-compatible curve lookup with manual linear interpolation.\n// Matches gimp_curve_map_value_inline() from gimpcurve-map.c:\n// index = value * (n_samples - 1)\n// f = fract(index)\n// result = (1-f) * samples[floor] + f * samples[ceil]\n//\n// Uses texelFetch (NEAREST) to avoid GPU half-texel offset issues\n// that occur with texture() + GL_LINEAR on small 256x1 LUTs.\nfloat applyCurve(sampler2D curve, float value) {\n value = clamp(value, 0.0, 1.0);\n\n float pos = value * 255.0;\n int lo = int(floor(pos));\n int hi = min(lo + 1, 255);\n float f = pos - float(lo);\n\n float a = texelFetch(curve, ivec2(lo, 0), 0).r;\n float b = texelFetch(curve, ivec2(hi, 0), 0).r;\n\n return a + f * (b - a);\n}\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n\n // GIMP order: per-channel curves first, then RGB master curve.\n // See gimp_curve_map_pixels() default case in gimpcurve-map.c:\n // dest = colors_curve( channel_curve( src ) )\n float tmp_r = applyCurve(u_curve1, color.r);\n float tmp_g = applyCurve(u_curve2, color.g);\n float tmp_b = applyCurve(u_curve3, color.b);\n color.r = applyCurve(u_curve0, tmp_r);\n color.g = applyCurve(u_curve0, tmp_g);\n color.b = applyCurve(u_curve0, tmp_b);\n\n fragColor0 = vec4(color.rgb, color.a);\n}\n",
|
||||
"from_input"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "ImageHistogram",
|
||||
"pos": [
|
||||
2800,
|
||||
-4300
|
||||
],
|
||||
"size": [
|
||||
210,
|
||||
150
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 34
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "HISTOGRAM",
|
||||
"name": "rgb",
|
||||
"type": "HISTOGRAM",
|
||||
"links": [
|
||||
35
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "HISTOGRAM",
|
||||
"name": "luminance",
|
||||
"type": "HISTOGRAM",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "HISTOGRAM",
|
||||
"name": "red",
|
||||
"type": "HISTOGRAM",
|
||||
"links": [
|
||||
36
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "HISTOGRAM",
|
||||
"name": "green",
|
||||
"type": "HISTOGRAM",
|
||||
"links": [
|
||||
37
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "HISTOGRAM",
|
||||
"name": "blue",
|
||||
"type": "HISTOGRAM",
|
||||
"links": [
|
||||
38
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageHistogram"
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 29,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 8,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"origin_id": 8,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"origin_id": 4,
|
||||
"origin_slot": 0,
|
||||
"target_id": 8,
|
||||
"target_slot": 2,
|
||||
"type": "CURVE"
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"origin_id": 5,
|
||||
"origin_slot": 0,
|
||||
"target_id": 8,
|
||||
"target_slot": 3,
|
||||
"type": "CURVE"
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"origin_id": 6,
|
||||
"origin_slot": 0,
|
||||
"target_id": 8,
|
||||
"target_slot": 4,
|
||||
"type": "CURVE"
|
||||
},
|
||||
{
|
||||
"id": 33,
|
||||
"origin_id": 7,
|
||||
"origin_slot": 0,
|
||||
"target_id": 8,
|
||||
"target_slot": 5,
|
||||
"type": "CURVE"
|
||||
},
|
||||
{
|
||||
"id": 34,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 9,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 35,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 0,
|
||||
"target_id": 4,
|
||||
"target_slot": 1,
|
||||
"type": "HISTOGRAM"
|
||||
},
|
||||
{
|
||||
"id": 36,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 2,
|
||||
"target_id": 5,
|
||||
"target_slot": 1,
|
||||
"type": "HISTOGRAM"
|
||||
},
|
||||
{
|
||||
"id": 37,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 3,
|
||||
"target_id": 6,
|
||||
"target_slot": 1,
|
||||
"type": "HISTOGRAM"
|
||||
},
|
||||
{
|
||||
"id": 38,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 4,
|
||||
"target_id": 7,
|
||||
"target_slot": 1,
|
||||
"type": "HISTOGRAM"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1 +1,322 @@
|
|||
{"revision": 0, "last_node_id": 29, "last_link_id": 0, "nodes": [{"id": 29, "type": "4c9d6ea4-b912-40e5-8766-6793a9758c53", "pos": [1970, -230], "size": [180, 86], "flags": {}, "order": 5, "mode": 0, "inputs": [{"label": "image", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": null}], "outputs": [{"label": "R", "localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": []}, {"label": "G", "localized_name": "IMAGE1", "name": "IMAGE1", "type": "IMAGE", "links": []}, {"label": "B", "localized_name": "IMAGE2", "name": "IMAGE2", "type": "IMAGE", "links": []}, {"label": "A", "localized_name": "IMAGE3", "name": "IMAGE3", "type": "IMAGE", "links": []}], "title": "Image Channels", "properties": {"proxyWidgets": []}, "widgets_values": []}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "4c9d6ea4-b912-40e5-8766-6793a9758c53", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 28, "lastLinkId": 39, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Image Channels", "inputNode": {"id": -10, "bounding": [1820, -185, 120, 60]}, "outputNode": {"id": -20, "bounding": [2460, -215, 120, 120]}, "inputs": [{"id": "3522932b-2d86-4a1f-a02a-cb29f3a9d7fe", "name": "images.image0", "type": "IMAGE", "linkIds": [39], "localized_name": "images.image0", "label": "image", "pos": [1920, -165]}], "outputs": [{"id": "605cb9c3-b065-4d9b-81d2-3ec331889b2b", "name": "IMAGE0", "type": "IMAGE", "linkIds": [26], "localized_name": "IMAGE0", "label": "R", "pos": [2480, -195]}, {"id": "fb44a77e-0522-43e9-9527-82e7465b3596", "name": "IMAGE1", "type": "IMAGE", "linkIds": [27], "localized_name": "IMAGE1", "label": "G", "pos": [2480, -175]}, {"id": "81460ee6-0131-402a-874f-6bf3001fc4ff", "name": "IMAGE2", "type": "IMAGE", "linkIds": [28], "localized_name": "IMAGE2", "label": "B", "pos": [2480, -155]}, {"id": "ae690246-80d4-4951-b1d9-9306d8a77417", "name": "IMAGE3", "type": "IMAGE", "linkIds": [29], "localized_name": "IMAGE3", 
"label": "A", "pos": [2480, -135]}], "widgets": [], "nodes": [{"id": 23, "type": "GLSLShader", "pos": [2000, -330], "size": [400, 172], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "image", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": 39}, {"localized_name": "fragment_shader", "name": "fragment_shader", "type": "STRING", "widget": {"name": "fragment_shader"}, "link": null}, {"localized_name": "size_mode", "name": "size_mode", "type": "COMFY_DYNAMICCOMBO_V3", "widget": {"name": "size_mode"}, "link": null}, {"label": "image1", "localized_name": "images.image1", "name": "images.image1", "shape": 7, "type": "IMAGE", "link": null}], "outputs": [{"label": "R", "localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": [26]}, {"label": "G", "localized_name": "IMAGE1", "name": "IMAGE1", "type": "IMAGE", "links": [27]}, {"label": "B", "localized_name": "IMAGE2", "name": "IMAGE2", "type": "IMAGE", "links": [28]}, {"label": "A", "localized_name": "IMAGE3", "name": "IMAGE3", "type": "IMAGE", "links": [29]}], "properties": {"Node name for S&R": "GLSLShader"}, "widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\nlayout(location = 1) out vec4 fragColor1;\nlayout(location = 2) out vec4 fragColor2;\nlayout(location = 3) out vec4 fragColor3;\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n // Output each channel as grayscale to separate render targets\n fragColor0 = vec4(vec3(color.r), 1.0); // Red channel\n fragColor1 = vec4(vec3(color.g), 1.0); // Green channel\n fragColor2 = vec4(vec3(color.b), 1.0); // Blue channel\n fragColor3 = vec4(vec3(color.a), 1.0); // Alpha channel\n}\n", "from_input"]}], "groups": [], "links": [{"id": 39, "origin_id": -10, "origin_slot": 0, "target_id": 23, "target_slot": 0, "type": "IMAGE"}, {"id": 26, "origin_id": 23, "origin_slot": 0, "target_id": -20, "target_slot": 
0, "type": "IMAGE"}, {"id": 27, "origin_id": 23, "origin_slot": 1, "target_id": -20, "target_slot": 1, "type": "IMAGE"}, {"id": 28, "origin_id": 23, "origin_slot": 2, "target_id": -20, "target_slot": 2, "type": "IMAGE"}, {"id": 29, "origin_id": 23, "origin_slot": 3, "target_id": -20, "target_slot": 3, "type": "IMAGE"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image Tools/Color adjust"}]}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 29,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 29,
|
||||
"type": "4c9d6ea4-b912-40e5-8766-6793a9758c53",
|
||||
"pos": [
|
||||
1970,
|
||||
-230
|
||||
],
|
||||
"size": [
|
||||
180,
|
||||
86
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "R",
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"label": "G",
|
||||
"localized_name": "IMAGE1",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"label": "B",
|
||||
"localized_name": "IMAGE2",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"label": "A",
|
||||
"localized_name": "IMAGE3",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Image Channels",
|
||||
"properties": {
|
||||
"proxyWidgets": []
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "4c9d6ea4-b912-40e5-8766-6793a9758c53",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 28,
|
||||
"lastLinkId": 39,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image Channels",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
1820,
|
||||
-185,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
2460,
|
||||
-215,
|
||||
120,
|
||||
120
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "3522932b-2d86-4a1f-a02a-cb29f3a9d7fe",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
39
|
||||
],
|
||||
"localized_name": "images.image0",
|
||||
"label": "image",
|
||||
"pos": [
|
||||
1920,
|
||||
-165
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "605cb9c3-b065-4d9b-81d2-3ec331889b2b",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
26
|
||||
],
|
||||
"localized_name": "IMAGE0",
|
||||
"label": "R",
|
||||
"pos": [
|
||||
2480,
|
||||
-195
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "fb44a77e-0522-43e9-9527-82e7465b3596",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
27
|
||||
],
|
||||
"localized_name": "IMAGE1",
|
||||
"label": "G",
|
||||
"pos": [
|
||||
2480,
|
||||
-175
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "81460ee6-0131-402a-874f-6bf3001fc4ff",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
28
|
||||
],
|
||||
"localized_name": "IMAGE2",
|
||||
"label": "B",
|
||||
"pos": [
|
||||
2480,
|
||||
-155
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ae690246-80d4-4951-b1d9-9306d8a77417",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
29
|
||||
],
|
||||
"localized_name": "IMAGE3",
|
||||
"label": "A",
|
||||
"pos": [
|
||||
2480,
|
||||
-135
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 23,
|
||||
"type": "GLSLShader",
|
||||
"pos": [
|
||||
2000,
|
||||
-330
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
172
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": 39
|
||||
},
|
||||
{
|
||||
"localized_name": "fragment_shader",
|
||||
"name": "fragment_shader",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "fragment_shader"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "size_mode",
|
||||
"name": "size_mode",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "size_mode"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "image1",
|
||||
"localized_name": "images.image1",
|
||||
"name": "images.image1",
|
||||
"shape": 7,
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "R",
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
26
|
||||
]
|
||||
},
|
||||
{
|
||||
"label": "G",
|
||||
"localized_name": "IMAGE1",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
27
|
||||
]
|
||||
},
|
||||
{
|
||||
"label": "B",
|
||||
"localized_name": "IMAGE2",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
28
|
||||
]
|
||||
},
|
||||
{
|
||||
"label": "A",
|
||||
"localized_name": "IMAGE3",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
29
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\nlayout(location = 1) out vec4 fragColor1;\nlayout(location = 2) out vec4 fragColor2;\nlayout(location = 3) out vec4 fragColor3;\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n // Output each channel as grayscale to separate render targets\n fragColor0 = vec4(vec3(color.r), 1.0); // Red channel\n fragColor1 = vec4(vec3(color.g), 1.0); // Green channel\n fragColor2 = vec4(vec3(color.b), 1.0); // Blue channel\n fragColor3 = vec4(vec3(color.a), 1.0); // Alpha channel\n}\n",
|
||||
"from_input"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 39,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 27,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 2,
|
||||
"target_id": -20,
|
||||
"target_slot": 2,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 29,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 3,
|
||||
"target_id": -20,
|
||||
"target_slot": 3,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1 +1,278 @@
|
|||
{"revision": 0, "last_node_id": 15, "last_link_id": 0, "nodes": [{"id": 15, "type": "24d8bbfd-39d4-4774-bff0-3de40cc7a471", "pos": [-1490, 2040], "size": [400, 260], "flags": {}, "order": 0, "mode": 0, "inputs": [{"name": "prompt", "type": "STRING", "widget": {"name": "prompt"}, "link": null}, {"label": "reference images", "name": "images", "type": "IMAGE", "link": null}], "outputs": [{"name": "STRING", "type": "STRING", "links": null}], "title": "Prompt Enhance", "properties": {"proxyWidgets": [["-1", "prompt"]], "cnr_id": "comfy-core", "ver": "0.14.1"}, "widgets_values": [""]}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "24d8bbfd-39d4-4774-bff0-3de40cc7a471", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 15, "lastLinkId": 14, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Prompt Enhance", "inputNode": {"id": -10, "bounding": [-2170, 2110, 138.876953125, 80]}, "outputNode": {"id": -20, "bounding": [-640, 2110, 120, 60]}, "inputs": [{"id": "aeab7216-00e0-4528-a09b-bba50845c5a6", "name": "prompt", "type": "STRING", "linkIds": [11], "pos": [-2051.123046875, 2130]}, {"id": "7b73fd36-aa31-4771-9066-f6c83879994b", "name": "images", "type": "IMAGE", "linkIds": [14], "label": "reference images", "pos": [-2051.123046875, 2150]}], "outputs": [{"id": "c7b0d930-68a1-48d1-b496-0519e5837064", "name": "STRING", "type": "STRING", "linkIds": [13], "pos": [-620, 2130]}], "widgets": [], "nodes": [{"id": 11, "type": "GeminiNode", "pos": [-1560, 1990], "size": [470, 470], "flags": {}, "order": 0, "mode": 0, "inputs": [{"localized_name": "images", "name": "images", "shape": 7, "type": "IMAGE", "link": 14}, {"localized_name": "audio", "name": "audio", "shape": 7, "type": "AUDIO", "link": null}, {"localized_name": "video", "name": "video", "shape": 7, "type": "VIDEO", "link": null}, {"localized_name": "files", "name": "files", "shape": 7, "type": "GEMINI_INPUT_FILES", "link": null}, {"localized_name": "prompt", "name": "prompt", "type": 
"STRING", "widget": {"name": "prompt"}, "link": 11}, {"localized_name": "model", "name": "model", "type": "COMBO", "widget": {"name": "model"}, "link": null}, {"localized_name": "seed", "name": "seed", "type": "INT", "widget": {"name": "seed"}, "link": null}, {"localized_name": "system_prompt", "name": "system_prompt", "shape": 7, "type": "STRING", "widget": {"name": "system_prompt"}, "link": null}], "outputs": [{"localized_name": "STRING", "name": "STRING", "type": "STRING", "links": [13]}], "properties": {"cnr_id": "comfy-core", "ver": "0.14.1", "Node name for S&R": "GeminiNode"}, "widgets_values": ["", "gemini-3-pro-preview", 42, "randomize", "You are an expert in prompt writing.\nBased on the input, rewrite the user's input into a detailed prompt.\nincluding camera settings, lighting, composition, and style.\nReturn the prompt only"], "color": "#432", "bgcolor": "#653"}], "groups": [], "links": [{"id": 11, "origin_id": -10, "origin_slot": 0, "target_id": 11, "target_slot": 4, "type": "STRING"}, {"id": 13, "origin_id": 11, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "STRING"}, {"id": 14, "origin_id": -10, "origin_slot": 1, "target_id": 11, "target_slot": 0, "type": "IMAGE"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Text generation/Prompt enhance"}]}, "extra": {}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 15,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 15,
|
||||
"type": "24d8bbfd-39d4-4774-bff0-3de40cc7a471",
|
||||
"pos": [
|
||||
-1490,
|
||||
2040
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
260
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "prompt",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "prompt"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "reference images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"title": "Prompt Enhance",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"prompt"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.14.1"
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "24d8bbfd-39d4-4774-bff0-3de40cc7a471",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 15,
|
||||
"lastLinkId": 14,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Prompt Enhance",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2170,
|
||||
2110,
|
||||
138.876953125,
|
||||
80
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-640,
|
||||
2110,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "aeab7216-00e0-4528-a09b-bba50845c5a6",
|
||||
"name": "prompt",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
11
|
||||
],
|
||||
"pos": [
|
||||
-2051.123046875,
|
||||
2130
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "7b73fd36-aa31-4771-9066-f6c83879994b",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
14
|
||||
],
|
||||
"label": "reference images",
|
||||
"pos": [
|
||||
-2051.123046875,
|
||||
2150
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "c7b0d930-68a1-48d1-b496-0519e5837064",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
13
|
||||
],
|
||||
"pos": [
|
||||
-620,
|
||||
2130
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 11,
|
||||
"type": "GeminiNode",
|
||||
"pos": [
|
||||
-1560,
|
||||
1990
|
||||
],
|
||||
"size": [
|
||||
470,
|
||||
470
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"shape": 7,
|
||||
"type": "IMAGE",
|
||||
"link": 14
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"shape": 7,
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "files",
|
||||
"name": "files",
|
||||
"shape": 7,
|
||||
"type": "GEMINI_INPUT_FILES",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "prompt",
|
||||
"name": "prompt",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "prompt"
|
||||
},
|
||||
"link": 11
|
||||
},
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "seed",
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "system_prompt",
|
||||
"name": "system_prompt",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "system_prompt"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.14.1",
|
||||
"Node name for S&R": "GeminiNode"
|
||||
},
|
||||
"widgets_values": [
|
||||
"",
|
||||
"gemini-3-pro-preview",
|
||||
42,
|
||||
"randomize",
|
||||
"You are an expert in prompt writing.\nBased on the input, rewrite the user's input into a detailed prompt.\nincluding camera settings, lighting, composition, and style.\nReturn the prompt only"
|
||||
],
|
||||
"color": "#432",
|
||||
"bgcolor": "#653"
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 11,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 4,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 11,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Prompt enhance"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1 +1,309 @@
|
|||
{"revision": 0, "last_node_id": 25, "last_link_id": 0, "nodes": [{"id": 25, "type": "621ba4e2-22a8-482d-a369-023753198b7b", "pos": [4610, -790], "size": [230, 58], "flags": {}, "order": 4, "mode": 0, "inputs": [{"label": "image", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": null}], "outputs": [{"label": "IMAGE", "localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": []}], "title": "Sharpen", "properties": {"proxyWidgets": [["24", "value"]]}, "widgets_values": []}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "621ba4e2-22a8-482d-a369-023753198b7b", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 24, "lastLinkId": 36, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Sharpen", "inputNode": {"id": -10, "bounding": [4090, -825, 120, 60]}, "outputNode": {"id": -20, "bounding": [5150, -825, 120, 60]}, "inputs": [{"id": "37011fb7-14b7-4e0e-b1a0-6a02e8da1fd7", "name": "images.image0", "type": "IMAGE", "linkIds": [34], "localized_name": "images.image0", "label": "image", "pos": [4190, -805]}], "outputs": [{"id": "e9182b3f-635c-4cd4-a152-4b4be17ae4b9", "name": "IMAGE0", "type": "IMAGE", "linkIds": [35], "localized_name": "IMAGE0", "label": "IMAGE", "pos": [5170, -805]}], "widgets": [], "nodes": [{"id": 24, "type": "PrimitiveFloat", "pos": [4280, -1240], "size": [270, 58], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "strength", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [36]}], "properties": {"Node name for S&R": "PrimitiveFloat", "min": 0, "max": 3, "precision": 2, "step": 0.05}, "widgets_values": [0.5]}, {"id": 23, "type": "GLSLShader", "pos": [4570, -1240], "size": [370, 192], "flags": {}, "order": 1, "mode": 0, "inputs": [{"label": "image0", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", 
"link": 34}, {"label": "image1", "localized_name": "images.image1", "name": "images.image1", "shape": 7, "type": "IMAGE", "link": null}, {"label": "u_float0", "localized_name": "floats.u_float0", "name": "floats.u_float0", "shape": 7, "type": "FLOAT", "link": 36}, {"label": "u_float1", "localized_name": "floats.u_float1", "name": "floats.u_float1", "shape": 7, "type": "FLOAT", "link": null}, {"label": "u_int0", "localized_name": "ints.u_int0", "name": "ints.u_int0", "shape": 7, "type": "INT", "link": null}, {"localized_name": "fragment_shader", "name": "fragment_shader", "type": "STRING", "widget": {"name": "fragment_shader"}, "link": null}, {"localized_name": "size_mode", "name": "size_mode", "type": "COMFY_DYNAMICCOMBO_V3", "widget": {"name": "size_mode"}, "link": null}], "outputs": [{"localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": [35]}, {"localized_name": "IMAGE1", "name": "IMAGE1", "type": "IMAGE", "links": null}, {"localized_name": "IMAGE2", "name": "IMAGE2", "type": "IMAGE", "links": null}, {"localized_name": "IMAGE3", "name": "IMAGE3", "type": "IMAGE", "links": null}], "properties": {"Node name for S&R": "GLSLShader"}, "widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / u_resolution;\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * 
u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}", "from_input"]}], "groups": [], "links": [{"id": 36, "origin_id": 24, "origin_slot": 0, "target_id": 23, "target_slot": 2, "type": "FLOAT"}, {"id": 34, "origin_id": -10, "origin_slot": 0, "target_id": 23, "target_slot": 0, "type": "IMAGE"}, {"id": 35, "origin_id": 23, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "IMAGE"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image Tools/Sharpen"}]}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 25,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 25,
|
||||
"type": "621ba4e2-22a8-482d-a369-023753198b7b",
|
||||
"pos": [
|
||||
4610,
|
||||
-790
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "IMAGE",
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Sharpen",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"24",
|
||||
"value"
|
||||
]
|
||||
]
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "621ba4e2-22a8-482d-a369-023753198b7b",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 24,
|
||||
"lastLinkId": 36,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Sharpen",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
4090,
|
||||
-825,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
5150,
|
||||
-825,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "37011fb7-14b7-4e0e-b1a0-6a02e8da1fd7",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
34
|
||||
],
|
||||
"localized_name": "images.image0",
|
||||
"label": "image",
|
||||
"pos": [
|
||||
4190,
|
||||
-805
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "e9182b3f-635c-4cd4-a152-4b4be17ae4b9",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
35
|
||||
],
|
||||
"localized_name": "IMAGE0",
|
||||
"label": "IMAGE",
|
||||
"pos": [
|
||||
5170,
|
||||
-805
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 24,
|
||||
"type": "PrimitiveFloat",
|
||||
"pos": [
|
||||
4280,
|
||||
-1240
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "strength",
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
36
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveFloat",
|
||||
"min": 0,
|
||||
"max": 3,
|
||||
"precision": 2,
|
||||
"step": 0.05
|
||||
},
|
||||
"widgets_values": [
|
||||
0.5
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"type": "GLSLShader",
|
||||
"pos": [
|
||||
4570,
|
||||
-1240
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
192
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image0",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": 34
|
||||
},
|
||||
{
|
||||
"label": "image1",
|
||||
"localized_name": "images.image1",
|
||||
"name": "images.image1",
|
||||
"shape": 7,
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "u_float0",
|
||||
"localized_name": "floats.u_float0",
|
||||
"name": "floats.u_float0",
|
||||
"shape": 7,
|
||||
"type": "FLOAT",
|
||||
"link": 36
|
||||
},
|
||||
{
|
||||
"label": "u_float1",
|
||||
"localized_name": "floats.u_float1",
|
||||
"name": "floats.u_float1",
|
||||
"shape": 7,
|
||||
"type": "FLOAT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "u_int0",
|
||||
"localized_name": "ints.u_int0",
|
||||
"name": "ints.u_int0",
|
||||
"shape": 7,
|
||||
"type": "INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "fragment_shader",
|
||||
"name": "fragment_shader",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "fragment_shader"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "size_mode",
|
||||
"name": "size_mode",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "size_mode"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
35
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE1",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE2",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE3",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}",
|
||||
"from_input"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 36,
|
||||
"origin_id": 24,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 34,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 35,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Sharpen"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1 +1,420 @@
|
|||
{"revision": 0, "last_node_id": 13, "last_link_id": 0, "nodes": [{"id": 13, "type": "cf95b747-3e17-46cb-8097-cac60ff9b2e1", "pos": [1120, 330], "size": [240, 58], "flags": {}, "order": 3, "mode": 0, "inputs": [{"localized_name": "video", "name": "video", "type": "VIDEO", "link": null}, {"name": "model_name", "type": "COMBO", "widget": {"name": "model_name"}, "link": null}], "outputs": [{"localized_name": "VIDEO", "name": "VIDEO", "type": "VIDEO", "links": []}], "title": "Video Upscale(GAN x4)", "properties": {"proxyWidgets": [["-1", "model_name"]], "cnr_id": "comfy-core", "ver": "0.14.1"}, "widgets_values": ["RealESRGAN_x4plus.safetensors"]}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "cf95b747-3e17-46cb-8097-cac60ff9b2e1", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 13, "lastLinkId": 19, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Video Upscale(GAN x4)", "inputNode": {"id": -10, "bounding": [550, 460, 120, 80]}, "outputNode": {"id": -20, "bounding": [1490, 460, 120, 60]}, "inputs": [{"id": "666d633e-93e7-42dc-8d11-2b7b99b0f2a6", "name": "video", "type": "VIDEO", "linkIds": [10], "localized_name": "video", "pos": [650, 480]}, {"id": "2e23a087-caa8-4d65-99e6-662761aa905a", "name": "model_name", "type": "COMBO", "linkIds": [19], "pos": [650, 500]}], "outputs": [{"id": "0c1768ea-3ec2-412f-9af6-8e0fa36dae70", "name": "VIDEO", "type": "VIDEO", "linkIds": [15], "localized_name": "VIDEO", "pos": [1510, 480]}], "widgets": [], "nodes": [{"id": 2, "type": "ImageUpscaleWithModel", "pos": [1110, 450], "size": [320, 46], "flags": {}, "order": 1, "mode": 0, "inputs": [{"localized_name": "upscale_model", "name": "upscale_model", "type": "UPSCALE_MODEL", "link": 1}, {"localized_name": "image", "name": "image", "type": "IMAGE", "link": 14}], "outputs": [{"localized_name": "IMAGE", "name": "IMAGE", "type": "IMAGE", "links": [13]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": 
"ImageUpscaleWithModel"}}, {"id": 11, "type": "CreateVideo", "pos": [1110, 550], "size": [320, 78], "flags": {}, "order": 3, "mode": 0, "inputs": [{"localized_name": "images", "name": "images", "type": "IMAGE", "link": 13}, {"localized_name": "audio", "name": "audio", "shape": 7, "type": "AUDIO", "link": 16}, {"localized_name": "fps", "name": "fps", "type": "FLOAT", "widget": {"name": "fps"}, "link": 12}], "outputs": [{"localized_name": "VIDEO", "name": "VIDEO", "type": "VIDEO", "links": [15]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": "CreateVideo"}, "widgets_values": [30]}, {"id": 10, "type": "GetVideoComponents", "pos": [1110, 330], "size": [320, 70], "flags": {}, "order": 2, "mode": 0, "inputs": [{"localized_name": "video", "name": "video", "type": "VIDEO", "link": 10}], "outputs": [{"localized_name": "images", "name": "images", "type": "IMAGE", "links": [14]}, {"localized_name": "audio", "name": "audio", "type": "AUDIO", "links": [16]}, {"localized_name": "fps", "name": "fps", "type": "FLOAT", "links": [12]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": "GetVideoComponents"}}, {"id": 1, "type": "UpscaleModelLoader", "pos": [750, 450], "size": [280, 60], "flags": {}, "order": 0, "mode": 0, "inputs": [{"localized_name": "model_name", "name": "model_name", "type": "COMBO", "widget": {"name": "model_name"}, "link": 19}], "outputs": [{"localized_name": "UPSCALE_MODEL", "name": "UPSCALE_MODEL", "type": "UPSCALE_MODEL", "links": [1]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": "UpscaleModelLoader", "models": [{"name": "RealESRGAN_x4plus.safetensors", "url": "https://huggingface.co/Comfy-Org/Real-ESRGAN_repackaged/resolve/main/RealESRGAN_x4plus.safetensors", "directory": "upscale_models"}]}, "widgets_values": ["RealESRGAN_x4plus.safetensors"]}], "groups": [], "links": [{"id": 1, "origin_id": 1, "origin_slot": 0, "target_id": 2, "target_slot": 0, "type": 
"UPSCALE_MODEL"}, {"id": 14, "origin_id": 10, "origin_slot": 0, "target_id": 2, "target_slot": 1, "type": "IMAGE"}, {"id": 13, "origin_id": 2, "origin_slot": 0, "target_id": 11, "target_slot": 0, "type": "IMAGE"}, {"id": 16, "origin_id": 10, "origin_slot": 1, "target_id": 11, "target_slot": 1, "type": "AUDIO"}, {"id": 12, "origin_id": 10, "origin_slot": 2, "target_id": 11, "target_slot": 2, "type": "FLOAT"}, {"id": 10, "origin_id": -10, "origin_slot": 0, "target_id": 10, "target_slot": 0, "type": "VIDEO"}, {"id": 15, "origin_id": 11, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "VIDEO"}, {"id": 19, "origin_id": -10, "origin_slot": 1, "target_id": 1, "target_slot": 0, "type": "COMBO"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Video generation and editing/Enhance video"}]}, "extra": {}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 13,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 13,
|
||||
"type": "cf95b747-3e17-46cb-8097-cac60ff9b2e1",
|
||||
"pos": [
|
||||
1120,
|
||||
330
|
||||
],
|
||||
"size": [
|
||||
240,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Video Upscale(GAN x4)",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"model_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.14.1"
|
||||
},
|
||||
"widgets_values": [
|
||||
"RealESRGAN_x4plus.safetensors"
|
||||
]
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "cf95b747-3e17-46cb-8097-cac60ff9b2e1",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 13,
|
||||
"lastLinkId": 19,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Video Upscale(GAN x4)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
550,
|
||||
460,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1490,
|
||||
460,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "666d633e-93e7-42dc-8d11-2b7b99b0f2a6",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
10
|
||||
],
|
||||
"localized_name": "video",
|
||||
"pos": [
|
||||
650,
|
||||
480
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "2e23a087-caa8-4d65-99e6-662761aa905a",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
19
|
||||
],
|
||||
"pos": [
|
||||
650,
|
||||
500
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "0c1768ea-3ec2-412f-9af6-8e0fa36dae70",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
15
|
||||
],
|
||||
"localized_name": "VIDEO",
|
||||
"pos": [
|
||||
1510,
|
||||
480
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 2,
|
||||
"type": "ImageUpscaleWithModel",
|
||||
"pos": [
|
||||
1110,
|
||||
450
|
||||
],
|
||||
"size": [
|
||||
320,
|
||||
46
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "upscale_model",
|
||||
"name": "upscale_model",
|
||||
"type": "UPSCALE_MODEL",
|
||||
"link": 1
|
||||
},
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 14
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "ImageUpscaleWithModel"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CreateVideo",
|
||||
"pos": [
|
||||
1110,
|
||||
550
|
||||
],
|
||||
"size": [
|
||||
320,
|
||||
78
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 13
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 16
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
},
|
||||
"link": 12
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": [
|
||||
15
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "CreateVideo"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
1110,
|
||||
330
|
||||
],
|
||||
"size": [
|
||||
320,
|
||||
70
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 10
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
14
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
16
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
12
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"type": "UpscaleModelLoader",
|
||||
"pos": [
|
||||
750,
|
||||
450
|
||||
],
|
||||
"size": [
|
||||
280,
|
||||
60
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model_name",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": 19
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "UPSCALE_MODEL",
|
||||
"name": "UPSCALE_MODEL",
|
||||
"type": "UPSCALE_MODEL",
|
||||
"links": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "UpscaleModelLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "RealESRGAN_x4plus.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/Real-ESRGAN_repackaged/resolve/main/RealESRGAN_x4plus.safetensors",
|
||||
"directory": "upscale_models"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"RealESRGAN_x4plus.safetensors"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "UPSCALE_MODEL"
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 11,
|
||||
"target_slot": 1,
|
||||
"type": "AUDIO"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 11,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 10,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Enhance video"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -110,11 +110,13 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
|
|||
|
||||
parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
|
||||
|
||||
CACHE_RAM_AUTO_GB = -1.0
|
||||
|
||||
cache_group = parser.add_mutually_exclusive_group()
|
||||
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
|
||||
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
|
||||
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
|
||||
cache_group.add_argument("--cache-ram", nargs='?', const=4.0, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threhold the cache remove large items to free RAM. Default 4GB")
|
||||
cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")
|
||||
|
||||
attn_group = parser.add_mutually_exclusive_group()
|
||||
attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
|
||||
|
|
|
|||
|
|
@ -611,6 +611,7 @@ class AceStepDiTModel(nn.Module):
|
|||
intermediate_size,
|
||||
patch_size,
|
||||
audio_acoustic_hidden_dim,
|
||||
condition_dim=None,
|
||||
layer_types=None,
|
||||
sliding_window=128,
|
||||
rms_norm_eps=1e-6,
|
||||
|
|
@ -640,7 +641,7 @@ class AceStepDiTModel(nn.Module):
|
|||
|
||||
self.time_embed = TimestepEmbedding(256, hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
self.time_embed_r = TimestepEmbedding(256, hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
self.condition_embedder = Linear(hidden_size, hidden_size, dtype=dtype, device=device)
|
||||
self.condition_embedder = Linear(condition_dim, hidden_size, dtype=dtype, device=device)
|
||||
|
||||
if layer_types is None:
|
||||
layer_types = ["full_attention"] * num_layers
|
||||
|
|
@ -1035,6 +1036,9 @@ class AceStepConditionGenerationModel(nn.Module):
|
|||
fsq_dim=2048,
|
||||
fsq_levels=[8, 8, 8, 5, 5, 5],
|
||||
fsq_input_num_quantizers=1,
|
||||
encoder_hidden_size=2048,
|
||||
encoder_intermediate_size=6144,
|
||||
encoder_num_heads=16,
|
||||
audio_model=None,
|
||||
dtype=None,
|
||||
device=None,
|
||||
|
|
@ -1054,24 +1058,24 @@ class AceStepConditionGenerationModel(nn.Module):
|
|||
|
||||
self.decoder = AceStepDiTModel(
|
||||
in_channels, hidden_size, num_dit_layers, num_heads, num_kv_heads, head_dim,
|
||||
intermediate_size, patch_size, audio_acoustic_hidden_dim,
|
||||
intermediate_size, patch_size, audio_acoustic_hidden_dim, condition_dim=encoder_hidden_size,
|
||||
layer_types=layer_types, sliding_window=sliding_window, rms_norm_eps=rms_norm_eps,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.encoder = AceStepConditionEncoder(
|
||||
text_hidden_dim, timbre_hidden_dim, hidden_size, num_lyric_layers, num_timbre_layers,
|
||||
num_heads, num_kv_heads, head_dim, intermediate_size, rms_norm_eps,
|
||||
text_hidden_dim, timbre_hidden_dim, encoder_hidden_size, num_lyric_layers, num_timbre_layers,
|
||||
encoder_num_heads, num_kv_heads, head_dim, encoder_intermediate_size, rms_norm_eps,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.tokenizer = AceStepAudioTokenizer(
|
||||
audio_acoustic_hidden_dim, hidden_size, pool_window_size, fsq_dim=fsq_dim, fsq_levels=fsq_levels, fsq_input_num_quantizers=fsq_input_num_quantizers, num_layers=num_tokenizer_layers, head_dim=head_dim, rms_norm_eps=rms_norm_eps,
|
||||
audio_acoustic_hidden_dim, encoder_hidden_size, pool_window_size, fsq_dim=fsq_dim, fsq_levels=fsq_levels, fsq_input_num_quantizers=fsq_input_num_quantizers, num_layers=num_tokenizer_layers, head_dim=head_dim, rms_norm_eps=rms_norm_eps,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.detokenizer = AudioTokenDetokenizer(
|
||||
hidden_size, pool_window_size, audio_acoustic_hidden_dim, num_layers=2, head_dim=head_dim,
|
||||
encoder_hidden_size, pool_window_size, audio_acoustic_hidden_dim, num_layers=2, head_dim=head_dim,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.null_condition_emb = nn.Parameter(torch.empty(1, 1, hidden_size, dtype=dtype, device=device))
|
||||
self.null_condition_emb = nn.Parameter(torch.empty(1, 1, encoder_hidden_size, dtype=dtype, device=device))
|
||||
|
||||
def prepare_condition(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,301 @@
|
|||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.model_management
|
||||
|
||||
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
    """Build rotary cos/sin tables for the positions in ``pos``.

    Args:
        pos: Position values; every element gets its own row of angles.
        dim: Rotary width for this axis. Must be even; ``dim // 2``
            frequencies are produced.
        theta: Base of the geometric frequency progression.

    Returns:
        A float32 tensor on ``pos.device`` of shape
        ``[2, *pos.shape, dim // 2]``; index 0 holds the cosines and
        index 1 the sines.
    """
    assert dim % 2 == 0

    # Angles are computed in float64; fall back to the CPU when the
    # device holding ``pos`` does not support that dtype.
    fp64_ok = comfy.model_management.supports_fp64(pos.device)
    work_device = pos.device if fp64_ok else torch.device("cpu")

    exponents = torch.arange(0, dim, 2, dtype=torch.float64, device=work_device) / dim
    inv_freq = 1.0 / (theta**exponents)
    angles = torch.einsum("...n,d->...nd", pos.to(work_device), inv_freq)
    table = torch.stack([torch.cos(angles), torch.sin(angles)], dim=0)
    # Hand the result back in float32 on the caller's device.
    return table.to(dtype=torch.float32, device=pos.device)
|
||||
|
||||
def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Rotate the leading channels of ``x_in`` using a cos/sin table.

    Args:
        x_in: Input tensor; only its first ``freqs_cis.shape[-1]`` channels
            along the last axis are rotated.
        freqs_cis: Table whose index 0 holds cosines and index 1 holds
            sines; both must broadcast against the rotated slice.

    Returns:
        A tensor shaped like ``x_in``: the rotated channels followed by the
        remaining channels, which pass through unchanged.
    """
    cos_table, sin_table = freqs_cis[0], freqs_cis[1]
    n_rot = freqs_cis.shape[-1]

    rotary_part = x_in[..., :n_rot]
    passthrough = x_in[..., n_rot:]

    # Half-split rotation: (a, b) -> (-b, a).
    first_half, second_half = rotary_part.chunk(2, dim=-1)
    swapped = torch.cat((-second_half, first_half), dim=-1)

    rotated = rotary_part * cos_table + swapped * sin_table
    return torch.cat((rotated, passthrough), dim=-1)
|
||||
|
||||
class ErnieImageEmbedND3(nn.Module):
    """Rotary position table built from three position axes.

    The last axis of ``ids`` is indexed at 0, 1 and 2; each slice is turned
    into a cos/sin table by ``rope`` using the matching entry of
    ``axes_dim``, and the three tables are concatenated channel-wise.
    """

    def __init__(self, dim: int, theta: int, axes_dim: tuple):
        super().__init__()
        self.axes_dim = list(axes_dim)
        self.theta = theta
        self.dim = dim

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        """Return the cos/sin table for ``ids``.

        Assuming ``ids`` is ``[B, S, 3]`` (TODO confirm against callers),
        the result is ``[2, B, S, 1, sum(axes_dim)]`` with each frequency
        repeated twice in adjacent channels.
        """
        per_axis = []
        for axis in range(3):
            per_axis.append(rope(ids[..., axis], self.axes_dim[axis], self.theta))

        # Insert a singleton axis at index 3 of the concatenated table.
        emb = torch.cat(per_axis, dim=-1).unsqueeze(3)

        # Duplicate every channel in place: (a, b, ...) -> (a, a, b, b, ...).
        doubled = torch.stack([emb, emb], dim=-1)
        return doubled.reshape(*emb.shape[:-1], -1)
|
||||
|
||||
class ErnieImagePatchEmbedDynamic(nn.Module):
    """Turn an image-like tensor into a sequence of patch embeddings.

    A strided convolution (kernel size == stride == ``patch_size``) projects
    each non-overlapping patch to ``embed_dim`` channels.
    """

    def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None):
        super().__init__()
        self.patch_size = patch_size
        self.proj = operations.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=True,
            device=device,
            dtype=dtype,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` and flatten the spatial grid into a token axis.

        Returns a contiguous ``[batch, height * width, embed_dim]`` tensor,
        where height and width are those of the convolution output.
        """
        patches = self.proj(x)
        n_batch, n_embed, grid_h, grid_w = patches.shape
        tokens = patches.reshape(n_batch, n_embed, grid_h * grid_w)
        # Channels-last layout: one row per patch.
        return tokens.transpose(1, 2).contiguous()
|
||||
|
||||
class Timesteps(nn.Module):
    """Sinusoidal embedding of scalar timesteps.

    Produces ``2 * (num_channels // 2)`` features per timestep: the sines
    followed by the cosines, or the reverse order when ``flip_sin_to_cos``
    is set.
    """

    def __init__(self, num_channels: int, flip_sin_to_cos: bool = False):
        super().__init__()
        self.flip_sin_to_cos = flip_sin_to_cos
        self.num_channels = num_channels

    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
        """Embed a 1-D tensor of timesteps into ``[len(timesteps), 2 * half]``."""
        half = self.num_channels // 2
        steps = torch.arange(half, dtype=torch.float32, device=timesteps.device)
        # Frequencies decay geometrically from 1 towards 1/10000.
        freqs = torch.exp(-math.log(10000) * steps / half)
        angles = timesteps[:, None].float() * freqs[None, :]

        sin_part = torch.sin(angles)
        cos_part = torch.cos(angles)
        ordered = [cos_part, sin_part] if self.flip_sin_to_cos else [sin_part, cos_part]
        return torch.cat(ordered, dim=-1)
|
||||
|
||||
class TimestepEmbedding(nn.Module):
    """Two-layer MLP (Linear -> SiLU -> Linear) applied to a timestep embedding.

    Both linear layers come from ``operations.Linear`` and carry a bias; the
    second layer maps ``time_embed_dim`` to ``time_embed_dim``.
    """

    def __init__(self, in_channels: int, time_embed_dim: int, operations, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.linear_1 = operations.Linear(in_channels, time_embed_dim, bias=True, **factory)
        self.act = nn.SiLU()
        self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, bias=True, **factory)

    def forward(self, sample: torch.Tensor) -> torch.Tensor:
        """Return ``linear_2(silu(linear_1(sample)))``."""
        return self.linear_2(self.act(self.linear_1(sample)))
|
||||
|
||||
class ErnieImageAttention(nn.Module):
|
||||
def __init__(self, query_dim: int, heads: int, dim_head: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
|
||||
super().__init__()
|
||||
self.heads = heads
|
||||
self.head_dim = dim_head
|
||||
self.inner_dim = heads * dim_head
|
||||
|
||||
Linear = operations.Linear
|
||||
RMSNorm = operations.RMSNorm
|
||||
|
||||
self.to_q = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
|
||||
self.to_k = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
|
||||
self.to_v = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
|
||||
|
||||
self.norm_q = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
|
||||
self.norm_k = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
|
||||
|
||||
self.to_out = nn.ModuleList([Linear(self.inner_dim, query_dim, bias=False, device=device, dtype=dtype)])
|
||||
|
||||
def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None, image_rotary_emb: torch.Tensor = None) -> torch.Tensor:
|
||||
B, S, _ = x.shape
|
||||
|
||||
q_flat = self.to_q(x)
|
||||
k_flat = self.to_k(x)
|
||||
v_flat = self.to_v(x)
|
||||
|
||||
query = q_flat.view(B, S, self.heads, self.head_dim)
|
||||
key = k_flat.view(B, S, self.heads, self.head_dim)
|
||||
|
||||
query = self.norm_q(query)
|
||||
key = self.norm_k(key)
|
||||
|
||||
if image_rotary_emb is not None:
|
||||
query = apply_rotary_emb(query, image_rotary_emb)
|
||||
key = apply_rotary_emb(key, image_rotary_emb)
|
||||
|
||||
q_flat = query.reshape(B, S, -1)
|
||||
k_flat = key.reshape(B, S, -1)
|
||||
|
||||
hidden_states = optimized_attention(q_flat, k_flat, v_flat, self.heads, mask=attention_mask)
|
||||
|
||||
return self.to_out[0](hidden_states)
|
||||
|
||||
class ErnieImageFeedForward(nn.Module):
    """Gated feed-forward block: ``linear_fc2(up_proj(x) * gelu(gate_proj(x)))``.

    All three projections are bias-free and come from ``operations.Linear``.
    """

    def __init__(self, hidden_size: int, ffn_hidden_size: int, operations, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.gate_proj = operations.Linear(hidden_size, ffn_hidden_size, bias=False, **factory)
        self.up_proj = operations.Linear(hidden_size, ffn_hidden_size, bias=False, **factory)
        self.linear_fc2 = operations.Linear(ffn_hidden_size, hidden_size, bias=False, **factory)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated MLP to the last dimension of ``x``."""
        lifted = self.up_proj(x)
        gate = F.gelu(self.gate_proj(x))
        return self.linear_fc2(lifted * gate)
|
||||
|
||||
class ErnieImageSharedAdaLNBlock(nn.Module):
    """Transformer block whose AdaLN modulation is supplied by the caller.

    The block owns only the two RMS norms, the attention and the MLP; the six
    shift/scale/gate tensors arrive through ``temb`` at call time.
    """

    def __init__(self, hidden_size: int, num_heads: int, ffn_hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}

        self.adaLN_sa_ln = operations.RMSNorm(hidden_size, eps=eps, **factory)
        self.self_attention = ErnieImageAttention(
            query_dim=hidden_size,
            dim_head=hidden_size // num_heads,
            heads=num_heads,
            eps=eps,
            operations=operations,
            **factory,
        )
        self.adaLN_mlp_ln = operations.RMSNorm(hidden_size, eps=eps, **factory)
        self.mlp = ErnieImageFeedForward(hidden_size, ffn_hidden_size, operations=operations, **factory)

    def forward(self, x, rotary_pos_emb, temb, attention_mask=None):
        """Run the attention and MLP sub-layers with gated residuals.

        ``temb`` must unpack into six values, in order: shift, scale and gate
        for attention, then shift, scale and gate for the MLP.
        """
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = temb

        # Attention sub-layer: norm -> modulate -> attend -> gated residual.
        attn_in = self.adaLN_sa_ln(x) * (1 + scale_msa) + shift_msa
        attn_out = self.self_attention(attn_in, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
        x = x + gate_msa * attn_out

        # MLP sub-layer: same pattern with the second modulation triple.
        mlp_in = self.adaLN_mlp_ln(x) * (1 + scale_mlp) + shift_mlp
        return x + gate_mlp * self.mlp(mlp_in)
|
||||
|
||||
class ErnieImageAdaLNContinuous(nn.Module):
    """Final adaptive LayerNorm: non-affine LayerNorm modulated by a conditioning vector.

    A single linear layer maps the conditioning to ``2 * hidden_size`` values
    that are split into a scale half followed by a shift half.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=eps, **factory)
        self.linear = operations.Linear(hidden_size, hidden_size * 2, **factory)

    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
        """Return ``shift + norm(x) * (1 + scale)``.

        ``scale`` and ``shift`` get a new axis at position 1 so they broadcast
        over dimension 1 of ``x``.
        """
        modulation = self.linear(conditioning)
        scale, shift = modulation.chunk(2, dim=-1)
        normed = self.norm(x)
        return torch.addcmul(shift.unsqueeze(1), normed, 1 + scale.unsqueeze(1))
|
||||
|
||||
class ErnieImageModel(nn.Module):
    """Image transformer that attends jointly over image patches and text tokens.

    The latent is patch-embedded, concatenated with the (optionally projected)
    text tokens, passed through ``num_layers`` blocks that all share one AdaLN
    modulation computed from the timestep, and finally projected back to
    patches. Only the image tokens are returned.

    ``qk_layernorm`` and any extra ``**kwargs`` are accepted by the constructor
    but are not read.
    """

    def __init__(
        self,
        hidden_size: int = 4096,
        num_attention_heads: int = 32,
        num_layers: int = 36,
        ffn_hidden_size: int = 12288,
        in_channels: int = 128,
        out_channels: int = 128,
        patch_size: int = 1,
        text_in_dim: int = 3072,
        rope_theta: int = 256,
        rope_axes_dim: tuple = (32, 48, 48),
        eps: float = 1e-6,
        qk_layernorm: bool = True,
        device=None,
        dtype=None,
        operations=None,
        **kwargs
    ):
        super().__init__()
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.num_heads = num_attention_heads
        self.head_dim = hidden_size // num_attention_heads
        self.patch_size = patch_size
        self.out_channels = out_channels

        Linear = operations.Linear

        self.x_embedder = ErnieImagePatchEmbedDynamic(in_channels, hidden_size, patch_size, operations, device, dtype)
        # Text tokens are projected only when their width differs from the model width.
        self.text_proj = Linear(text_in_dim, hidden_size, bias=False, device=device, dtype=dtype) if text_in_dim != hidden_size else None

        self.time_proj = Timesteps(hidden_size, flip_sin_to_cos=False)
        self.time_embedding = TimestepEmbedding(hidden_size, hidden_size, operations, device, dtype)

        self.pos_embed = ErnieImageEmbedND3(dim=self.head_dim, theta=rope_theta, axes_dim=rope_axes_dim)

        # One modulation head shared by every block: 6 * hidden_size values
        # (shift/scale/gate for attention and for the MLP).
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            Linear(hidden_size, 6 * hidden_size, device=device, dtype=dtype)
        )

        self.layers = nn.ModuleList([
            ErnieImageSharedAdaLNBlock(hidden_size, num_attention_heads, ffn_hidden_size, eps, operations, device, dtype)
            for _ in range(num_layers)
        ])

        self.final_norm = ErnieImageAdaLNContinuous(hidden_size, eps, operations, device, dtype)
        self.final_linear = Linear(hidden_size, patch_size * patch_size * out_channels, device=device, dtype=dtype)

    def _build_position_ids(self, batch, grid_h, grid_w, text_len, device, transformer_options):
        """Return float32 rope ids of shape [batch, grid_h * grid_w + text_len, 3].

        Image tokens come first, then text tokens, matching the token order
        used in ``forward``. Channel 0 is the sequence/time axis, channel 1 the
        row axis and channel 2 the column axis.
        """
        # Text tokens: 0 .. text_len - 1 on channel 0, zeros on the spatial channels.
        text_ids = torch.zeros((batch, text_len, 3), device=device, dtype=torch.float32)
        text_ids[:, :, 0] = torch.linspace(0, text_len - 1, steps=text_len, device=device, dtype=torch.float32)

        # All image tokens share one channel-0 index placed right after the text range.
        index = float(text_len)
        h_len, w_len = float(grid_h), float(grid_w)
        h_offset, w_offset = 0.0, 0.0

        rope_options = transformer_options.get("rope_options", None)
        if rope_options is not None:
            # Scaling stretches the span between the first and last coordinate
            # while keeping the number of steps fixed.
            h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
            w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
            index += rope_options.get("shift_t", 0.0)
            h_offset += rope_options.get("shift_y", 0.0)
            w_offset += rope_options.get("shift_x", 0.0)

        image_ids = torch.zeros((grid_h, grid_w, 3), device=device, dtype=torch.float32)
        # Write each channel directly; the buffer is zero-initialised, so no
        # read-back of an existing channel is needed.
        image_ids[:, :, 0] = index
        image_ids[:, :, 1] = torch.linspace(h_offset, h_len - 1 + h_offset, steps=grid_h, device=device, dtype=torch.float32).unsqueeze(1)
        image_ids[:, :, 2] = torch.linspace(w_offset, w_len - 1 + w_offset, steps=grid_w, device=device, dtype=torch.float32).unsqueeze(0)
        image_ids = image_ids.view(1, grid_h * grid_w, 3).expand(batch, -1, -1)

        return torch.cat([image_ids, text_ids], dim=1)

    def forward(self, x, timesteps, context, **kwargs):
        """Run the model on latent ``x`` of shape [B, C, H, W].

        Args:
            x: Input latent; ``H`` and ``W`` are divided by ``patch_size``.
            timesteps: 1-D tensor fed to the sinusoidal timestep embedding.
            context: Text tokens of shape [B, T, text_in_dim].
            **kwargs: ``transformer_options["rope_options"]`` may carry
                ``scale_x``/``scale_y`` and ``shift_t``/``shift_x``/``shift_y``.

        Returns:
            Tensor of shape [B, out_channels, H, W].
        """
        device, dtype = x.device, x.dtype
        B, _, H, W = x.shape
        p = self.patch_size
        Hp, Wp = H // p, W // p
        N_img = Hp * Wp

        img_bsh = self.x_embedder(x)

        text_bth = context
        if self.text_proj is not None and text_bth.numel() > 0:
            text_bth = self.text_proj(text_bth)
        Tmax = text_bth.shape[1]

        # Token order is [image, text]; the rope ids below use the same order.
        hidden_states = torch.cat([img_bsh, text_bth], dim=1)

        transformer_options = kwargs.get("transformer_options", {})
        position_ids = self._build_position_ids(B, Hp, Wp, Tmax, device, transformer_options)
        rotary_pos_emb = self.pos_embed(position_ids).to(x.dtype)
        del position_ids

        sample = self.time_proj(timesteps).to(dtype)
        c = self.time_embedding(sample)

        # Six modulation tensors, each given a broadcast axis over the sequence.
        temb = [t.unsqueeze(1).contiguous() for t in self.adaLN_modulation(c).chunk(6, dim=-1)]

        for layer in self.layers:
            hidden_states = layer(hidden_states, rotary_pos_emb, temb)

        hidden_states = self.final_norm(hidden_states, c).type_as(hidden_states)

        # Drop the text tokens and fold the patches back into an image.
        patches = self.final_linear(hidden_states)[:, :N_img, :]
        output = (
            patches.view(B, Hp, Wp, p, p, self.out_channels)
            .permute(0, 5, 1, 3, 2, 4)
            .contiguous()
            .view(B, self.out_channels, H, W)
        )

        return output
|
||||
|
|
@ -16,7 +16,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transforme
|
|||
|
||||
def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
|
||||
assert dim % 2 == 0
|
||||
if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
|
||||
if not comfy.model_management.supports_fp64(pos.device):
|
||||
device = torch.device("cpu")
|
||||
else:
|
||||
device = pos.device
|
||||
|
|
|
|||
|
|
@ -386,7 +386,7 @@ class Flux(nn.Module):
|
|||
h = max(h, ref.shape[-2] + h_offset)
|
||||
w = max(w, ref.shape[-1] + w_offset)
|
||||
|
||||
kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset)
|
||||
kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset, transformer_options=transformer_options)
|
||||
img = torch.cat([img, kontext], dim=1)
|
||||
img_ids = torch.cat([img_ids, kontext_ids], dim=1)
|
||||
ref_num_tokens.append(kontext.shape[1])
|
||||
|
|
|
|||
|
|
@ -681,6 +681,33 @@ class LTXAVModel(LTXVModel):
|
|||
additional_args["has_spatial_mask"] = has_spatial_mask
|
||||
|
||||
ax, a_latent_coords = self.a_patchifier.patchify(ax)
|
||||
|
||||
# Inject reference audio for ID-LoRA in-context conditioning
|
||||
ref_audio = kwargs.get("ref_audio", None)
|
||||
ref_audio_seq_len = 0
|
||||
if ref_audio is not None:
|
||||
ref_tokens = ref_audio["tokens"].to(dtype=ax.dtype, device=ax.device)
|
||||
if ref_tokens.shape[0] < ax.shape[0]:
|
||||
ref_tokens = ref_tokens.expand(ax.shape[0], -1, -1)
|
||||
ref_audio_seq_len = ref_tokens.shape[1]
|
||||
B = ax.shape[0]
|
||||
|
||||
# Compute negative temporal positions matching ID-LoRA convention:
|
||||
# offset by -(end_of_last_token + time_per_latent) so reference ends just before t=0
|
||||
p = self.a_patchifier
|
||||
tpl = p.hop_length * p.audio_latent_downsample_factor / p.sample_rate
|
||||
ref_start = p._get_audio_latent_time_in_sec(0, ref_audio_seq_len, torch.float32, ax.device)
|
||||
ref_end = p._get_audio_latent_time_in_sec(1, ref_audio_seq_len + 1, torch.float32, ax.device)
|
||||
time_offset = ref_end[-1].item() + tpl
|
||||
ref_start = (ref_start - time_offset).unsqueeze(0).expand(B, -1).unsqueeze(1)
|
||||
ref_end = (ref_end - time_offset).unsqueeze(0).expand(B, -1).unsqueeze(1)
|
||||
ref_pos = torch.stack([ref_start, ref_end], dim=-1)
|
||||
|
||||
additional_args["ref_audio_seq_len"] = ref_audio_seq_len
|
||||
additional_args["target_audio_seq_len"] = ax.shape[1]
|
||||
ax = torch.cat([ref_tokens, ax], dim=1)
|
||||
a_latent_coords = torch.cat([ref_pos.to(a_latent_coords), a_latent_coords], dim=2)
|
||||
|
||||
ax = self.audio_patchify_proj(ax)
|
||||
|
||||
# additional_args.update({"av_orig_shape": list(x.shape)})
|
||||
|
|
@ -721,6 +748,14 @@ class LTXAVModel(LTXVModel):
|
|||
|
||||
# Prepare audio timestep
|
||||
a_timestep = kwargs.get("a_timestep")
|
||||
ref_audio_seq_len = kwargs.get("ref_audio_seq_len", 0)
|
||||
if ref_audio_seq_len > 0 and a_timestep is not None:
|
||||
# Reference tokens must have timestep=0, expand scalar/1D timestep to per-token so ref=0 and target=sigma.
|
||||
target_len = kwargs.get("target_audio_seq_len")
|
||||
if a_timestep.dim() <= 1:
|
||||
a_timestep = a_timestep.view(-1, 1).expand(batch_size, target_len)
|
||||
ref_ts = torch.zeros(batch_size, ref_audio_seq_len, *a_timestep.shape[2:], device=a_timestep.device, dtype=a_timestep.dtype)
|
||||
a_timestep = torch.cat([ref_ts, a_timestep], dim=1)
|
||||
if a_timestep is not None:
|
||||
a_timestep_scaled = a_timestep * self.timestep_scale_multiplier
|
||||
a_timestep_flat = a_timestep_scaled.flatten()
|
||||
|
|
@ -955,6 +990,13 @@ class LTXAVModel(LTXVModel):
|
|||
v_embedded_timestep = embedded_timestep[0]
|
||||
a_embedded_timestep = embedded_timestep[1]
|
||||
|
||||
# Trim reference audio tokens before unpatchification
|
||||
ref_audio_seq_len = kwargs.get("ref_audio_seq_len", 0)
|
||||
if ref_audio_seq_len > 0:
|
||||
ax = ax[:, ref_audio_seq_len:]
|
||||
if a_embedded_timestep.shape[1] > 1:
|
||||
a_embedded_timestep = a_embedded_timestep[:, ref_audio_seq_len:]
|
||||
|
||||
# Expand compressed video timestep if needed
|
||||
if isinstance(v_embedded_timestep, CompressedTimestep):
|
||||
v_embedded_timestep = v_embedded_timestep.expand()
|
||||
|
|
|
|||
|
|
@ -4,9 +4,6 @@ import math
|
|||
import torch
|
||||
import torchaudio
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.model_patcher
|
||||
import comfy.utils as utils
|
||||
from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
|
||||
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
|
||||
from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
|
||||
|
|
@ -43,30 +40,6 @@ class AudioVAEComponentConfig:
|
|||
|
||||
return cls(autoencoder=audio_config, vocoder=vocoder_config)
|
||||
|
||||
|
||||
class ModelDeviceManager:
    """Wraps a module in a ``ModelPatcher`` and exposes its load device."""

    def __init__(self, module: torch.nn.Module):
        # The load device comes from get_torch_device(), the offload device
        # from vae_offload_device().
        self.patcher = comfy.model_patcher.ModelPatcher(
            module,
            comfy.model_management.get_torch_device(),
            comfy.model_management.vae_offload_device(),
        )

    def ensure_model_loaded(self) -> None:
        """Free memory for the patched model on its load device, then load it."""
        patcher = self.patcher
        comfy.model_management.free_memory(patcher.model_size(), patcher.load_device)
        comfy.model_management.load_model_gpu(patcher)

    def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
        """Return ``tensor`` moved to the patcher's load device."""
        target = self.patcher.load_device
        return tensor.to(target)

    @property
    def load_device(self):
        """The patcher's load device."""
        return self.patcher.load_device
|
||||
|
||||
|
||||
class AudioLatentNormalizer:
|
||||
"""Applies per-channel statistics in patch space and restores original layout."""
|
||||
|
||||
|
|
@ -132,23 +105,17 @@ class AudioPreprocessor:
|
|||
class AudioVAE(torch.nn.Module):
|
||||
"""High-level Audio VAE wrapper exposing encode and decode entry points."""
|
||||
|
||||
def __init__(self, state_dict: dict, metadata: dict):
|
||||
def __init__(self, metadata: dict):
|
||||
super().__init__()
|
||||
|
||||
component_config = AudioVAEComponentConfig.from_metadata(metadata)
|
||||
|
||||
vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
|
||||
vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)
|
||||
|
||||
self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
|
||||
if "bwe" in component_config.vocoder:
|
||||
self.vocoder = VocoderWithBWE(config=component_config.vocoder)
|
||||
else:
|
||||
self.vocoder = Vocoder(config=component_config.vocoder)
|
||||
|
||||
self.autoencoder.load_state_dict(vae_sd, strict=False)
|
||||
self.vocoder.load_state_dict(vocoder_sd, strict=False)
|
||||
|
||||
autoencoder_config = self.autoencoder.get_config()
|
||||
self.normalizer = AudioLatentNormalizer(
|
||||
AudioPatchifier(
|
||||
|
|
@ -168,18 +135,12 @@ class AudioVAE(torch.nn.Module):
|
|||
n_fft=autoencoder_config["n_fft"],
|
||||
)
|
||||
|
||||
self.device_manager = ModelDeviceManager(self)
|
||||
|
||||
def encode(self, audio: dict) -> torch.Tensor:
|
||||
def encode(self, audio, sample_rate=44100) -> torch.Tensor:
|
||||
"""Encode a waveform dictionary into normalized latent tensors."""
|
||||
|
||||
waveform = audio["waveform"]
|
||||
waveform_sample_rate = audio["sample_rate"]
|
||||
waveform = audio
|
||||
waveform_sample_rate = sample_rate
|
||||
input_device = waveform.device
|
||||
# Ensure that Audio VAE is loaded on the correct device.
|
||||
self.device_manager.ensure_model_loaded()
|
||||
|
||||
waveform = self.device_manager.move_to_load_device(waveform)
|
||||
expected_channels = self.autoencoder.encoder.in_channels
|
||||
if waveform.shape[1] != expected_channels:
|
||||
if waveform.shape[1] == 1:
|
||||
|
|
@ -190,7 +151,7 @@ class AudioVAE(torch.nn.Module):
|
|||
)
|
||||
|
||||
mel_spec = self.preprocessor.waveform_to_mel(
|
||||
waveform, waveform_sample_rate, device=self.device_manager.load_device
|
||||
waveform, waveform_sample_rate, device=waveform.device
|
||||
)
|
||||
|
||||
latents = self.autoencoder.encode(mel_spec)
|
||||
|
|
@ -204,17 +165,13 @@ class AudioVAE(torch.nn.Module):
|
|||
"""Decode normalized latent tensors into an audio waveform."""
|
||||
original_shape = latents.shape
|
||||
|
||||
# Ensure that Audio VAE is loaded on the correct device.
|
||||
self.device_manager.ensure_model_loaded()
|
||||
|
||||
latents = self.device_manager.move_to_load_device(latents)
|
||||
latents = self.normalizer.denormalize(latents)
|
||||
|
||||
target_shape = self.target_shape_from_latents(original_shape)
|
||||
mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)
|
||||
|
||||
waveform = self.run_vocoder(mel_spec)
|
||||
return self.device_manager.move_to_load_device(waveform)
|
||||
return waveform
|
||||
|
||||
def target_shape_from_latents(self, latents_shape):
|
||||
batch, _, time, _ = latents_shape
|
||||
|
|
|
|||
|
|
@ -155,6 +155,7 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
|
|||
def __init__(self, embed_dim: int, **kwargs):
|
||||
self.max_batch_size = kwargs.pop("max_batch_size", None)
|
||||
ddconfig = kwargs.pop("ddconfig")
|
||||
decoder_ddconfig = kwargs.pop("decoder_ddconfig", ddconfig)
|
||||
super().__init__(
|
||||
encoder_config={
|
||||
"target": "comfy.ldm.modules.diffusionmodules.model.Encoder",
|
||||
|
|
@ -162,7 +163,7 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
|
|||
},
|
||||
decoder_config={
|
||||
"target": "comfy.ldm.modules.diffusionmodules.model.Decoder",
|
||||
"params": ddconfig,
|
||||
"params": decoder_ddconfig,
|
||||
},
|
||||
**kwargs,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -34,6 +34,16 @@ class TimestepBlock(nn.Module):
|
|||
#This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index"
|
||||
def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None):
|
||||
for layer in ts:
|
||||
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
|
||||
found_patched = False
|
||||
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
|
||||
if isinstance(layer, class_type):
|
||||
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
|
||||
found_patched = True
|
||||
break
|
||||
if found_patched:
|
||||
continue
|
||||
|
||||
if isinstance(layer, VideoResBlock):
|
||||
x = layer(x, emb, num_video_frames, image_only_indicator)
|
||||
elif isinstance(layer, TimestepBlock):
|
||||
|
|
@ -49,15 +59,6 @@ def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, out
|
|||
elif isinstance(layer, Upsample):
|
||||
x = layer(x, output_shape=output_shape)
|
||||
else:
|
||||
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
|
||||
found_patched = False
|
||||
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
|
||||
if isinstance(layer, class_type):
|
||||
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
|
||||
found_patched = True
|
||||
break
|
||||
if found_patched:
|
||||
continue
|
||||
x = layer(x)
|
||||
return x
|
||||
|
||||
|
|
@ -894,6 +895,12 @@ class UNetModel(nn.Module):
|
|||
h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
|
||||
h = apply_control(h, control, 'middle')
|
||||
|
||||
if "middle_block_after_patch" in transformer_patches:
|
||||
patch = transformer_patches["middle_block_after_patch"]
|
||||
for p in patch:
|
||||
out = p({"h": h, "x": x, "emb": emb, "context": context, "y": y,
|
||||
"timesteps": timesteps, "transformer_options": transformer_options})
|
||||
h = out["h"]
|
||||
|
||||
for id, module in enumerate(self.output_blocks):
|
||||
transformer_options["block"] = ("output", id)
|
||||
|
|
@ -905,8 +912,9 @@ class UNetModel(nn.Module):
|
|||
for p in patch:
|
||||
h, hsp = p(h, hsp, transformer_options)
|
||||
|
||||
h = th.cat([h, hsp], dim=1)
|
||||
del hsp
|
||||
if hsp is not None:
|
||||
h = th.cat([h, hsp], dim=1)
|
||||
del hsp
|
||||
if len(hs) > 0:
|
||||
output_shape = hs[-1].shape
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -3,12 +3,9 @@ from ..diffusionmodules.openaimodel import Timestep
|
|||
import torch
|
||||
|
||||
class CLIPEmbeddingNoiseAugmentation(ImageConcatWithNoiseAugmentation):
|
||||
def __init__(self, *args, clip_stats_path=None, timestep_dim=256, **kwargs):
|
||||
def __init__(self, *args, timestep_dim=256, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if clip_stats_path is None:
|
||||
clip_mean, clip_std = torch.zeros(timestep_dim), torch.ones(timestep_dim)
|
||||
else:
|
||||
clip_mean, clip_std = torch.load(clip_stats_path, map_location="cpu")
|
||||
clip_mean, clip_std = torch.zeros(timestep_dim), torch.ones(timestep_dim)
|
||||
self.register_buffer("data_mean", clip_mean[None, :], persistent=False)
|
||||
self.register_buffer("data_std", clip_std[None, :], persistent=False)
|
||||
self.time_embed = Timestep(timestep_dim)
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ class HeatmapHead(torch.nn.Module):
|
|||
origin_max = np.max(hm[k])
|
||||
dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
|
||||
dr[border:-border, border:-border] = hm[k].copy()
|
||||
dr = gaussian_filter(dr, sigma=2.0)
|
||||
dr = gaussian_filter(dr, sigma=2.0, truncate=2.5)
|
||||
hm[k] = dr[border:-border, border:-border].copy()
|
||||
cur_max = np.max(hm[k])
|
||||
if cur_max > 0:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,725 @@
|
|||
from collections import OrderedDict
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchvision
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
|
||||
# Class names for the 80 detection categories, in list-index order.
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HGNetv2 backbone
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ConvBNAct(nn.Module):
    """Conv2d -> BatchNorm2d -> ReLU (identity instead of ReLU when ``use_act`` is false).

    The convolution has no bias and uses symmetric padding of ``(k - 1) // 2``,
    so an even kernel such as ``k=2`` gets zero padding from this layer.
    """

    def __init__(self, ic, oc, k=3, s=1, groups=1, use_act=True, device=None, dtype=None, operations=None):
        super().__init__()
        pad = (k - 1) // 2
        self.conv = operations.Conv2d(ic, oc, k, s, pad, groups=groups, bias=False, device=device, dtype=dtype)
        self.bn = nn.BatchNorm2d(oc, device=device, dtype=dtype)
        if use_act:
            self.act = nn.ReLU()
        else:
            self.act = nn.Identity()

    def forward(self, x):
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)
|
||||
|
||||
class LightConvBNAct(nn.Module):
    """Pointwise conv (no activation) followed by a depthwise ``k x k`` conv with ReLU."""

    def __init__(self, ic, oc, k, device=None, dtype=None, operations=None):
        super().__init__()
        shared = {"device": device, "dtype": dtype, "operations": operations}
        # 1x1 channel mixing without activation.
        self.conv1 = ConvBNAct(ic, oc, 1, use_act=False, **shared)
        # Depthwise spatial conv: one group per channel.
        self.conv2 = ConvBNAct(oc, oc, k, groups=oc, use_act=True, **shared)

    def forward(self, x):
        mixed = self.conv1(x)
        return self.conv2(mixed)
|
||||
|
||||
class _StemBlock(nn.Module):
    """Stem with a two-conv branch and a max-pool branch that are concatenated."""

    def __init__(self, ic, mc, oc, device=None, dtype=None, operations=None):
        super().__init__()
        shared = {"device": device, "dtype": dtype, "operations": operations}
        self.stem1 = ConvBNAct(ic, mc, 3, 2, **shared)
        # stem2a / stem2b use kernel=2, stride=1 and get no padding from the
        # layer itself; forward pads manually (matching the PaddlePaddle original).
        self.stem2a = ConvBNAct(mc, mc // 2, 2, 1, **shared)
        self.stem2b = ConvBNAct(mc // 2, mc, 2, 1, **shared)
        self.stem3 = ConvBNAct(mc * 2, mc, 3, 2, **shared)
        self.stem4 = ConvBNAct(mc, oc, 1, **shared)
        self.pool = nn.MaxPool2d(2, 1, ceil_mode=True)

    def forward(self, x):
        # One extra column on the right and one extra row at the bottom so the
        # kernel-2 ops below keep the spatial size.
        trunk = F.pad(self.stem1(x), (0, 1, 0, 1))

        conv_branch = self.stem2a(trunk)
        conv_branch = self.stem2b(F.pad(conv_branch, (0, 1, 0, 1)))

        pool_branch = self.pool(trunk)

        merged = torch.cat([pool_branch, conv_branch], 1)
        return self.stem4(self.stem3(merged))
|
||||
|
||||
|
||||
class _HG_Block(nn.Module):
    """Chain of conv layers whose input and every intermediate output are
    concatenated and squeezed by two 1x1 convs, with an optional residual."""

    def __init__(self, ic, mc, oc, layer_num, k=3, residual=False, light=False, device=None, dtype=None, operations=None):
        super().__init__()
        self.residual = residual
        shared = {"device": device, "dtype": dtype, "operations": operations}

        # Both layer types take (in_channels, out_channels, kernel) positionally.
        layer_cls = LightConvBNAct if light else ConvBNAct
        self.layers = nn.ModuleList(
            [layer_cls(ic if i == 0 else mc, mc, k, **shared) for i in range(layer_num)])

        # Channels after concatenating the block input with every layer output.
        total = ic + layer_num * mc
        self.aggregation = nn.Sequential(
            ConvBNAct(total, oc // 2, 1, **shared),
            ConvBNAct(oc // 2, oc, 1, **shared))

    def forward(self, x):
        feats = [x]
        for layer in self.layers:
            feats.append(layer(feats[-1]))
        fused = self.aggregation(torch.cat(feats, 1))
        if self.residual:
            return fused + x
        return fused
|
||||
|
||||
|
||||
class _HG_Stage(nn.Module):
    """Optional stride-2 depthwise downsample followed by ``num_blocks`` _HG_Blocks.

    Positional config order: ic, mc, oc, num_blocks, downsample, light, k, layer_num.
    """

    def __init__(self, ic, mc, oc, num_blocks, downsample=True, light=False, k=3, layer_num=6, device=None, dtype=None, operations=None):
        super().__init__()
        shared = {"device": device, "dtype": dtype, "operations": operations}

        if not downsample:
            self.downsample = nn.Identity()
        else:
            # Depthwise 3x3, stride 2, no activation.
            self.downsample = ConvBNAct(ic, ic, 3, 2, groups=ic, use_act=False, **shared)

        blocks = []
        for i in range(num_blocks):
            first = i == 0
            # Only the first block changes the channel count, so only the
            # later blocks can carry a residual connection.
            blocks.append(
                _HG_Block(ic if first else oc, mc, oc, layer_num,
                          k=k, residual=not first, light=light, **shared)
            )
        self.blocks = nn.Sequential(*blocks)

    def forward(self, x):
        reduced = self.downsample(x)
        return self.blocks(reduced)
|
||||
|
||||
|
||||
class HGNetv2(nn.Module):
    """Stem plus four _HG_Stages; returns the feature maps of selected stages."""

    # B5 config: stem=[3,32,64], stages=[ic, mc, oc, blocks, down, light, k, layers]
    _STAGE_CFGS = [[64,   64,  128,  1, False, False, 3, 6],
                   [128,  128, 512,  2, True,  False, 3, 6],
                   [512,  256, 1024, 5, True,  True,  5, 6],
                   [1024, 512, 2048, 2, True,  True,  5, 6]]

    def __init__(self, return_idx=(1, 2, 3), device=None, dtype=None, operations=None):
        super().__init__()
        shared = {"device": device, "dtype": dtype, "operations": operations}
        self.stem = _StemBlock(3, 32, 64, **shared)
        self.stages = nn.ModuleList([_HG_Stage(*cfg, **shared) for cfg in self._STAGE_CFGS])
        self.return_idx = list(return_idx)
        # Output channel count (column 2 of the config) of each returned stage.
        self.out_channels = [self._STAGE_CFGS[i][2] for i in return_idx]

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """Return the outputs of the stages listed in ``return_idx``, in stage order."""
        feat = self.stem(x)
        selected = []
        for stage_no, stage in enumerate(self.stages):
            feat = stage(feat)
            if stage_no in self.return_idx:
                selected.append(feat)
        return selected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Encoder — HybridEncoder (dfine version: RepNCSPELAN4 + SCDown PAN)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ConvNormLayer(nn.Module):
    """Biased Conv2d followed by SiLU or identity (expects pre-fused BN weights).

    ``padding`` defaults to ``(k - 1) // 2`` when left as ``None``. Any ``act``
    value other than ``'silu'`` selects the identity.
    """

    def __init__(self, ic, oc, k, s, g=1, padding=None, act=None, device=None, dtype=None, operations=None):
        super().__init__()
        if padding is None:
            padding = (k - 1) // 2
        self.conv = operations.Conv2d(ic, oc, k, s, padding, groups=g, bias=True, device=device, dtype=dtype)
        if act == 'silu':
            self.act = nn.SiLU()
        else:
            self.act = nn.Identity()

    def forward(self, x):
        y = self.conv(x)
        return self.act(y)
|
||||
|
||||
|
||||
class VGGBlock(nn.Module):
    """Rep-VGG block as a single 3x3 conv plus SiLU (expects pre-fused weights)."""

    def __init__(self, ic, oc, device=None, dtype=None, operations=None):
        super().__init__()
        kernel, stride = 3, 1
        self.conv = operations.Conv2d(ic, oc, kernel, stride, padding=1, bias=True, device=device, dtype=dtype)
        self.act = nn.SiLU()

    def forward(self, x):
        y = self.conv(x)
        return self.act(y)
|
||||
|
||||
|
||||
class CSPLayer(nn.Module):
    """Two-branch block: a 1x1 conv followed by a stack of VGG blocks, summed
    with a parallel 1x1 conv, then an optional 1x1 output conv.

    The output conv is replaced by an identity when the hidden width
    ``int(oc * expansion)`` already equals ``oc``.
    """

    def __init__(self, ic, oc, num_blocks=3, expansion=1.0, act='silu', device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        hidden = int(oc * expansion)
        self.conv1 = ConvNormLayer(ic, hidden, 1, 1, act=act, **factory)
        self.conv2 = ConvNormLayer(ic, hidden, 1, 1, act=act, **factory)
        blocks = [VGGBlock(hidden, hidden, **factory) for _ in range(num_blocks)]
        self.bottlenecks = nn.Sequential(*blocks)
        if hidden != oc:
            self.conv3 = ConvNormLayer(hidden, oc, 1, 1, act=act, **factory)
        else:
            self.conv3 = nn.Identity()

    def forward(self, x):
        main = self.bottlenecks(self.conv1(x))
        shortcut = self.conv2(x)
        return self.conv3(main + shortcut)
|
||||
|
||||
|
||||
class RepNCSPELAN4(nn.Module):
    """CSP-ELAN block — the FPN/PAN block in RTv4's HybridEncoder.

    ``cv1`` output is split into two halves of ``c3 // 2`` channels; the second
    half feeds ``cv2``, whose output feeds ``cv3``. All four tensors are
    concatenated on the channel axis and fused by ``cv4``.
    """

    def __init__(self, c1, c2, c3, c4, n=3, act='silu', device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        self.c = c3 // 2
        self.cv1 = ConvNormLayer(c1, c3, 1, 1, act=act, **factory)
        self.cv2 = nn.Sequential(
            CSPLayer(c3 // 2, c4, n, 1.0, act=act, **factory),
            ConvNormLayer(c4, c4, 3, 1, act=act, **factory),
        )
        self.cv3 = nn.Sequential(
            CSPLayer(c4, c4, n, 1.0, act=act, **factory),
            ConvNormLayer(c4, c4, 3, 1, act=act, **factory),
        )
        self.cv4 = ConvNormLayer(c3 + 2 * c4, c2, 1, 1, act=act, **factory)

    def forward(self, x):
        first, second = self.cv1(x).split((self.c, self.c), 1)
        # Each stage consumes the most recently produced tensor.
        third = self.cv2(second)
        fourth = self.cv3(third)
        return self.cv4(torch.cat([first, second, third, fourth], 1))
|
||||
|
||||
|
||||
class SCDown(nn.Module):
    """Separable conv downsampling used in HybridEncoder PAN bottom-up path.

    A 1x1 conv (``ic`` -> ``oc``) followed by a ``k`` x ``k`` conv with stride
    ``s`` and ``groups=oc``; neither conv has an activation.
    """

    def __init__(self, ic, oc, k, s, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        self.cv1 = ConvNormLayer(ic, oc, 1, 1, **factory)
        self.cv2 = ConvNormLayer(oc, oc, k, s, g=oc, **factory)

    def forward(self, x):
        pointwise = self.cv1(x)
        return self.cv2(pointwise)
|
||||
|
||||
|
||||
class SelfAttention(nn.Module):
    """Multi-head attention with separate ``q_proj``/``k_proj``/``v_proj``/``out_proj`` linears.

    The attention kernel is chosen per call by ``optimized_attention_for_device``.
    """

    def __init__(self, embed_dim, num_heads, device=None, dtype=None, operations=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # Registration order (q, k, v, out) is kept so parameters are created in the same sequence.
        for name in ('q_proj', 'k_proj', 'v_proj', 'out_proj'):
            setattr(self, name, operations.Linear(embed_dim, embed_dim, device=device, dtype=dtype))

    def forward(self, query, key, value, attn_mask=None):
        attend = optimized_attention_for_device(query.device, False, small_input=True)
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
        attended = attend(q, k, v, heads=self.num_heads, mask=attn_mask)
        return self.out_proj(attended)
|
||||
|
||||
|
||||
class _TransformerEncoderLayer(nn.Module):
    """Single AIFI encoder layer: post-norm self-attention and a GELU feed-forward.

    ``pos_embed`` (if given) is added to the query and key inputs only; the
    value input is always the raw ``src``.
    """

    def __init__(self, d_model, nhead, dim_feedforward, device=None, dtype=None, operations=None):
        super().__init__()
        self.self_attn = SelfAttention(d_model, nhead, device=device, dtype=dtype, operations=operations)
        self.linear1 = operations.Linear(d_model, dim_feedforward, device=device, dtype=dtype)
        self.linear2 = operations.Linear(dim_feedforward, d_model, device=device, dtype=dtype)
        self.norm1 = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.norm2 = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.activation = nn.GELU()

    def forward(self, src, src_mask=None, pos_embed=None):
        qk = src if pos_embed is None else src + pos_embed
        attn_out = self.self_attn(qk, qk, value=src, attn_mask=src_mask)
        src = self.norm1(src + attn_out)
        ff_out = self.linear2(self.activation(self.linear1(src)))
        return self.norm2(src + ff_out)
|
||||
|
||||
|
||||
class _TransformerEncoder(nn.Module):
|
||||
"""Thin wrapper so state-dict keys are encoder.0.layers.N.*"""
|
||||
def __init__(self, num_layers, d_model, nhead, dim_feedforward, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.layers = nn.ModuleList([
|
||||
_TransformerEncoderLayer(d_model, nhead, dim_feedforward, device=device, dtype=dtype, operations=operations)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
def forward(self, src, src_mask=None, pos_embed=None):
|
||||
for layer in self.layers:
|
||||
src = layer(src, src_mask=src_mask, pos_embed=pos_embed)
|
||||
return src
|
||||
|
||||
|
||||
class HybridEncoder(nn.Module):
    """Multi-scale encoder: 1x1 input projections, AIFI transformer layers on the
    levels in ``use_encoder_idx``, a top-down FPN and a bottom-up PAN.

    Args:
        in_channels: channel count of each incoming feature level.
        feat_strides: stride of each level relative to the input image.
        hidden_dim: channel count of every projected/output level.
        nhead, dim_feedforward, num_encoder_layers: AIFI transformer settings.
        use_encoder_idx: indices of the levels that go through the transformer.
        pe_temperature: temperature of the sin-cos positional embedding.
        expansion, depth_mult, act: settings of the RepNCSPELAN4 fusion blocks.
        eval_spatial_size: ``(height, width)`` used to pre-compute positional
            embeddings; falsy disables the cache.

    Positional embeddings are cached for ``eval_spatial_size`` only. For any
    other input resolution (or when the cache is disabled) they are built on
    the fly for the actual feature grid, so ``forward`` no longer fails there.
    """

    def __init__(self, in_channels=(512, 1024, 2048), feat_strides=(8, 16, 32), hidden_dim=256, nhead=8, dim_feedforward=2048, use_encoder_idx=(2,), num_encoder_layers=1,
                 pe_temperature=10000, expansion=1.0, depth_mult=1.0, act='silu', eval_spatial_size=(640, 640), device=None, dtype=None, operations=None):
        super().__init__()
        self.in_channels = list(in_channels)
        self.feat_strides = list(feat_strides)
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = list(use_encoder_idx)
        self.pe_temperature = pe_temperature
        self.eval_spatial_size = eval_spatial_size
        self.out_channels = [hidden_dim] * len(in_channels)
        self.out_strides = list(feat_strides)

        # channel projection (expects pre-fused weights)
        self.input_proj = nn.ModuleList([
            nn.Sequential(OrderedDict([('conv', operations.Conv2d(ch, hidden_dim, 1, bias=True, device=device, dtype=dtype))]))
            for ch in in_channels
        ])

        # AIFI transformer — use _TransformerEncoder so keys are encoder.0.layers.N.*
        self.encoder = nn.ModuleList([
            _TransformerEncoder(num_encoder_layers, hidden_dim, nhead, dim_feedforward, device=device, dtype=dtype, operations=operations)
            for _ in range(len(use_encoder_idx))
        ])

        nb = round(3 * depth_mult)
        exp = expansion

        # top-down FPN (dfine: lateral conv has no act)
        self.lateral_convs = nn.ModuleList(
            [ConvNormLayer(hidden_dim, hidden_dim, 1, 1, device=device, dtype=dtype, operations=operations)
             for _ in range(len(in_channels) - 1)])
        self.fpn_blocks = nn.ModuleList(
            [RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(exp * hidden_dim // 2), nb, act=act, device=device, dtype=dtype, operations=operations)
             for _ in range(len(in_channels) - 1)])

        # bottom-up PAN (dfine: nn.Sequential(SCDown) — keeps checkpoint key .0.cv1/.0.cv2)
        self.downsample_convs = nn.ModuleList(
            [nn.Sequential(SCDown(hidden_dim, hidden_dim, 3, 2, device=device, dtype=dtype, operations=operations))
             for _ in range(len(in_channels) - 1)])
        self.pan_blocks = nn.ModuleList(
            [RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(exp * hidden_dim // 2), nb, act=act, device=device, dtype=dtype, operations=operations)
             for _ in range(len(in_channels) - 1)])

        # cache positional embeddings for fixed spatial size
        if eval_spatial_size:
            for idx in self.use_encoder_idx:
                stride = self.feat_strides[idx]
                pe = self._build_pe(eval_spatial_size[1] // stride,
                                    eval_spatial_size[0] // stride,
                                    hidden_dim, pe_temperature)
                setattr(self, f'pos_embed{idx}', pe)

    @staticmethod
    def _build_pe(w, h, dim=256, temp=10000.):
        """Build a 2D sin-cos positional embedding of shape ``[1, w * h, dim]``.

        The grid comes from ``meshgrid(arange(w), arange(h), indexing='ij')``,
        so rows are ordered with the ``w`` index varying slowest.
        """
        assert dim % 4 == 0
        gw = torch.arange(w, dtype=torch.float32)
        gh = torch.arange(h, dtype=torch.float32)
        gw, gh = torch.meshgrid(gw, gh, indexing='ij')
        pdim = dim // 4
        omega = 1. / (temp ** (torch.arange(pdim, dtype=torch.float32) / pdim))
        ow = gw.flatten()[:, None] @ omega[None]
        oh = gh.flatten()[:, None] @ omega[None]
        return torch.cat([ow.sin(), ow.cos(), oh.sin(), oh.cos()], 1)[None]

    def _pos_embed(self, enc_idx, h, w):
        """Return the positional embedding for level ``enc_idx`` on an ``h x w`` grid.

        The embedding cached in ``__init__`` is reused only when the grid
        implied by ``eval_spatial_size`` equals ``(h, w)``; otherwise a fresh
        embedding is built for the actual grid.
        """
        cached = getattr(self, f'pos_embed{enc_idx}', None)
        if cached is not None and self.eval_spatial_size:
            stride = self.feat_strides[enc_idx]
            cached_hw = (self.eval_spatial_size[0] // stride, self.eval_spatial_size[1] // stride)
            if cached_hw == (h, w):
                return cached
        return self._build_pe(w, h, self.hidden_dim, self.pe_temperature)

    def forward(self, feats: List[torch.Tensor]) -> List[torch.Tensor]:
        """Encode ``feats`` (one tensor per level) and return one fused tensor per level."""
        proj = [self.input_proj[i](f) for i, f in enumerate(feats)]

        # AIFI: flatten the selected level to a token sequence, attend, restore the map.
        for i, enc_idx in enumerate(self.use_encoder_idx):
            h, w = proj[enc_idx].shape[2:]
            src = proj[enc_idx].flatten(2).permute(0, 2, 1)
            pe = self._pos_embed(enc_idx, h, w).to(device=src.device, dtype=src.dtype)
            src = self.encoder[i](src, pos_embed=pe)
            proj[enc_idx] = src.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous()

        # Top-down FPN: walk from the last level to the first.
        n = len(self.in_channels)
        inner = [proj[-1]]
        for k in range(n - 1, 0, -1):
            j = n - 1 - k
            top = self.lateral_convs[j](inner[0])
            inner[0] = top
            up = F.interpolate(top, scale_factor=2., mode='nearest')
            inner.insert(0, self.fpn_blocks[j](torch.cat([up, proj[k - 1]], 1)))

        # Bottom-up PAN: walk back from the first level to the last.
        outs = [inner[0]]
        for k in range(n - 1):
            down = self.downsample_convs[k](outs[-1])
            outs.append(self.pan_blocks[k](torch.cat([down, inner[k + 1]], 1)))
        return outs
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Decoder — DFINETransformer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _deformable_attn_v2(value: list, spatial_shapes, sampling_locations: torch.Tensor, attention_weights: torch.Tensor, num_points_list: List[int]) -> torch.Tensor:
|
||||
"""
|
||||
value : list of per-level tensors [bs*n_head, c, h_l, w_l]
|
||||
sampling_locations: [bs, Lq, n_head, sum(pts), 2] in [0,1]
|
||||
attention_weights : [bs, Lq, n_head, sum(pts)]
|
||||
"""
|
||||
_, c = value[0].shape[:2] # bs*n_head, c
|
||||
_, Lq, n_head, _, _ = sampling_locations.shape
|
||||
bs = sampling_locations.shape[0]
|
||||
n_h = n_head
|
||||
|
||||
grids = (2 * sampling_locations - 1) # [bs, Lq, n_head, sum_pts, 2]
|
||||
grids = grids.permute(0, 2, 1, 3, 4).flatten(0, 1) # [bs*n_head, Lq, sum_pts, 2]
|
||||
grids_per_lvl = grids.split(num_points_list, dim=2) # list of [bs*n_head, Lq, pts_l, 2]
|
||||
|
||||
sampled = []
|
||||
for lvl, (h, w) in enumerate(spatial_shapes):
|
||||
val_l = value[lvl].reshape(bs * n_h, c, h, w)
|
||||
sv = F.grid_sample(val_l, grids_per_lvl[lvl], mode='bilinear', padding_mode='zeros', align_corners=False)
|
||||
sampled.append(sv) # sv: [bs*n_head, c, Lq, pts_l]
|
||||
|
||||
attn = attention_weights.permute(0, 2, 1, 3) # [bs, n_head, Lq, sum_pts]
|
||||
attn = attn.flatten(0, 1).unsqueeze(1) # [bs*n_head, 1, Lq, sum_pts]
|
||||
out = (torch.cat(sampled, -1) * attn).sum(-1) # [bs*n_head, c, Lq]
|
||||
out = out.reshape(bs, n_h * c, Lq)
|
||||
return out.permute(0, 2, 1) # [bs, Lq, hidden]
|
||||
|
||||
|
||||
class MSDeformableAttention(nn.Module):
    """Multi-scale deformable attention with a per-level number of sampling points.

    Args:
        embed_dim: query feature width.
        num_heads: number of attention heads.
        num_levels: number of feature levels (used only when ``num_points`` is a scalar).
        num_points: sampling points per level — either one int applied to every
            level, or a list/tuple with one entry per level.
        offset_scale: multiplier applied to the predicted sampling offsets.
    """

    def __init__(self, embed_dim=256, num_heads=8, num_levels=3, num_points=4, offset_scale=0.5, device=None, dtype=None, operations=None):
        super().__init__()
        self.embed_dim, self.num_heads = embed_dim, num_heads
        self.head_dim = embed_dim // num_heads
        # Accept tuples as well as lists: callers may forward a tuple unchanged,
        # and treating it as a scalar would produce a list of tuples.
        if isinstance(num_points, (list, tuple)):
            pts = list(num_points)
        else:
            pts = [num_points] * num_levels
        self.num_points_list = pts
        self.offset_scale = offset_scale
        total = num_heads * sum(pts)
        # One 1/n entry per sampling point, where n is the point count of that point's level.
        self.register_buffer('num_points_scale', torch.tensor([1. / n for n in pts for _ in range(n)], dtype=torch.float32))
        self.sampling_offsets = operations.Linear(embed_dim, total * 2, device=device, dtype=dtype)
        self.attention_weights = operations.Linear(embed_dim, total, device=device, dtype=dtype)

    def forward(self, query, ref_pts, value, spatial_shapes):
        """Predict sampling locations/weights from ``query`` and sample ``value``.

        ``ref_pts`` is indexed as ``[..., :2]`` (added to the offsets) and
        ``[..., 2:]`` (scales the offsets) — presumably box centre and size;
        confirm against the caller.
        """
        bs, Lq = query.shape[:2]
        total_pts = sum(self.num_points_list)
        offsets = self.sampling_offsets(query).reshape(bs, Lq, self.num_heads, total_pts, 2)
        attn_w = F.softmax(self.attention_weights(query).reshape(bs, Lq, self.num_heads, total_pts), -1)
        scale = self.num_points_scale.to(query).unsqueeze(-1)
        offset = offsets * scale * ref_pts[:, :, None, :, 2:] * self.offset_scale
        locs = ref_pts[:, :, None, :, :2] + offset  # [bs, Lq, n_head, sum_pts, 2]
        return _deformable_attn_v2(value, spatial_shapes, locs, attn_w, self.num_points_list)
|
||||
|
||||
|
||||
class Gate(nn.Module):
    """Blend two same-width streams with learned sigmoid gates, then LayerNorm.

    A single linear layer maps the concatenated inputs to ``2 * d_model``
    gate logits; the first half gates ``x1`` and the second half gates ``x2``.
    """

    def __init__(self, d_model, device=None, dtype=None, operations=None):
        super().__init__()
        width = 2 * d_model
        self.gate = operations.Linear(width, width, device=device, dtype=dtype)
        self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype)

    def forward(self, x1, x2):
        joint = torch.cat([x1, x2], -1)
        gates = torch.sigmoid(self.gate(joint))
        g1, g2 = gates.chunk(2, -1)
        mixed = g1 * x1 + g2 * x2
        return self.norm(mixed)
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Stack of ``num_layers`` linear layers with SiLU between them.

    Layer widths are ``in_dim -> hidden_dim -> ... -> hidden_dim -> out_dim``;
    no activation follows the last layer.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_layers, device=None, dtype=None, operations=None):
        super().__init__()
        dims = [in_dim] + [hidden_dim] * (num_layers - 1) + [out_dim]
        self.layers = nn.ModuleList(operations.Linear(dims[i], dims[i + 1], device=device, dtype=dtype) for i in range(num_layers))

    def forward(self, x):
        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < last:
                # Functional SiLU: avoids building a new nn.SiLU module on every call.
                x = F.silu(x)
        return x
|
||||
|
||||
|
||||
class TransformerDecoderLayer(nn.Module):
    """Decoder layer: self-attention, deformable cross-attention merged through
    a ``Gate``, then a ReLU feed-forward with a final LayerNorm.
    """

    def __init__(self, d_model=256, nhead=8, dim_feedforward=1024, num_levels=3, num_points=4, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        self.self_attn = SelfAttention(d_model, nhead, **factory)
        self.norm1 = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.cross_attn = MSDeformableAttention(d_model, nhead, num_levels, num_points, **factory)
        self.gateway = Gate(d_model, **factory)
        self.linear1 = operations.Linear(d_model, dim_feedforward, device=device, dtype=dtype)
        self.activation = nn.ReLU()
        self.linear2 = operations.Linear(dim_feedforward, d_model, device=device, dtype=dtype)
        self.norm3 = operations.LayerNorm(d_model, device=device, dtype=dtype)

    def forward(self, target, ref_pts, value, spatial_shapes, attn_mask=None, query_pos=None):
        def with_pos(tensor):
            # Positional queries are optional; without them the tensor is used as-is.
            return tensor if query_pos is None else tensor + query_pos

        qk = with_pos(target)
        attn_out = self.self_attn(qk, qk, value=target, attn_mask=attn_mask)
        target = self.norm1(target + attn_out)

        cross_out = self.cross_attn(with_pos(target), ref_pts, value, spatial_shapes)
        target = self.gateway(target, cross_out)

        ff_out = self.linear2(self.activation(self.linear1(target)))
        # +/-65504 is the largest finite float16 value.
        return self.norm3((target + ff_out).clamp(-65504, 65504))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FDR utilities
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def weighting_function(reg_max, up, reg_scale):
    """Non-uniform weighting function W(n) for FDR box regression.

    Returns a 1-D tensor (dtype/device of ``up``) laid out as
    ``[-2*ub, <negative steps>, 0, <positive steps>, 2*ub]`` where
    ``ub = |up[0]| * |reg_scale|`` and the inner steps grow geometrically.
    """
    inner_bound = (abs(up[0]) * abs(reg_scale)).item()
    outer_bound = inner_bound * 2
    step = (inner_bound + 1) ** (2 / (reg_max - 2))
    half = reg_max // 2
    negatives = [-(step ** i) + 1 for i in reversed(range(1, half))]
    positives = [(step ** i) - 1 for i in range(1, half)]
    values = [-outer_bound, *negatives, 0, *positives, outer_bound]
    return torch.tensor(values, dtype=up.dtype, device=up.device)
|
||||
|
||||
|
||||
def distance2bbox(points, distance, reg_scale):
    """Decode edge-distances → cxcywh boxes.

    ``points`` holds reference boxes as (cx, cy, w, h) in its last axis and
    ``distance`` holds the four edge offsets (left, top, right, bottom).
    """
    rs = abs(reg_scale).to(dtype=points.dtype)
    half = 0.5 * rs
    cx, cy = points[..., 0], points[..., 1]
    unit_w = points[..., 2] / rs
    unit_h = points[..., 3] / rs

    left = cx - (half + distance[..., 0]) * unit_w
    top = cy - (half + distance[..., 1]) * unit_h
    right = cx + (half + distance[..., 2]) * unit_w
    bottom = cy + (half + distance[..., 3]) * unit_h

    centre_x = (left + right) / 2
    centre_y = (top + bottom) / 2
    return torch.stack([centre_x, centre_y, right - left, bottom - top], -1)
|
||||
|
||||
|
||||
class Integral(nn.Module):
    """Sum Pr(n)·W(n) over the distribution bins.

    The last axis of ``x`` is read as groups of ``reg_max + 1`` logits; each
    group is soft-maxed and reduced to its expectation under ``project``.
    """

    def __init__(self, reg_max=32):
        super().__init__()
        self.reg_max = reg_max

    def forward(self, x, project):
        lead_shape = list(x.shape[:-1])
        bins = self.reg_max + 1
        probs = F.softmax(x.reshape(-1, bins), 1)
        weights = project.to(device=probs.device, dtype=probs.dtype)
        expected = F.linear(probs, weights).reshape(-1, 4)
        return expected.reshape(lead_shape + [-1])
|
||||
|
||||
|
||||
class LQE(nn.Module):
    """Location Quality Estimator — refines class scores using corner distribution.

    For each of the four edges the top-``k`` bin probabilities and their mean
    are fed to a small MLP whose scalar output is added to ``scores``.
    """

    def __init__(self, k=4, hidden_dim=64, num_layers=2, reg_max=32, device=None, dtype=None, operations=None):
        super().__init__()
        self.k = k
        self.reg_max = reg_max
        self.reg_conf = MLP(4 * (k + 1), hidden_dim, 1, num_layers, device=device, dtype=dtype, operations=operations)

    def forward(self, scores, pred_corners):
        batch, num_queries, _ = pred_corners.shape
        logits = pred_corners.reshape(batch, num_queries, 4, self.reg_max + 1)
        prob = F.softmax(logits, -1)
        top_prob = prob.topk(self.k, -1).values
        stats = torch.cat([top_prob, top_prob.mean(-1, keepdim=True)], -1)
        quality = self.reg_conf(stats.reshape(batch, num_queries, -1))
        return scores + quality
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Module):
    """Stack of decoder layers with FDR box refinement (inference path).

    Only layers ``0..eval_idx`` are built; ``forward`` stops at ``eval_idx``
    and returns that layer's boxes and LQE-refined logits.
    """

    def __init__(self, hidden_dim, nhead, dim_feedforward, num_levels, num_points, num_layers, reg_max, reg_scale, up, eval_idx=-1, device=None, dtype=None, operations=None):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.nhead = nhead
        # A negative eval_idx counts from the end of the layer stack.
        if eval_idx < 0:
            eval_idx = num_layers + eval_idx
        self.eval_idx = eval_idx
        self.up = up
        self.reg_scale = reg_scale
        self.reg_max = reg_max

        kept = self.eval_idx + 1
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, num_levels, num_points, device=device, dtype=dtype, operations=operations)
            for _ in range(kept)
        ])
        self.lqe_layers = nn.ModuleList([
            LQE(4, 64, 2, reg_max, device=device, dtype=dtype, operations=operations)
            for _ in range(kept)
        ])
        self.register_buffer('project', weighting_function(reg_max, up, reg_scale))

    def _value_op(self, memory, spatial_shapes):
        """Reshape memory to per-level value tensors for deformable attention."""
        bs, tokens = memory.shape[0], memory.shape[1]              # memory: [bs, sum(h*w), hidden_dim]
        head_dim = self.hidden_dim // self.nhead
        level_sizes = [h * w for h, w in spatial_shapes]
        per_head = memory.reshape(bs, tokens, self.nhead, head_dim)
        per_head = per_head.permute(0, 2, 3, 1).flatten(0, 1)      # [bs*n_head, c, sum_hw]
        return per_head.split(level_sizes, dim=-1)                 # list of [bs*n_head, c, h_l*w_l]

    def forward(self, target, ref_pts_unact, memory, spatial_shapes, bbox_head, score_head, query_pos_head, pre_bbox_head, integral):
        # Per-level values reshaped to [bs*n_head, c, h_l, w_l] for deformable attention.
        flat_values = self._value_op(memory, spatial_shapes)
        value = [
            flat.reshape(flat.shape[0], flat.shape[1], h, w)
            for flat, (h, w) in zip(flat_values, spatial_shapes)
        ]

        ref_pts = F.sigmoid(ref_pts_unact)
        output = target
        output_detach = 0
        pred_corners_undetach = 0

        dec_bboxes, dec_logits = [], []

        for i, layer in enumerate(self.layers):
            ref_input = ref_pts.unsqueeze(2)  # [bs, Lq, 1, 4]
            query_pos = query_pos_head(ref_pts).clamp(-10, 10)
            output = layer(output, ref_input, value, spatial_shapes, query_pos=query_pos)

            if i == 0:
                # Initial boxes: inverse-sigmoid of the clamped reference points plus a learned delta.
                clamped = ref_pts.clamp(1e-5, 1 - 1e-5)
                ref_unact = torch.log(clamped / (1 - clamped))
                pre_bboxes = F.sigmoid(pre_bbox_head(output) + ref_unact)
                ref_pts_initial = pre_bboxes.detach()

            pred_corners = bbox_head[i](output + output_detach) + pred_corners_undetach
            inter_ref_bbox = distance2bbox(ref_pts_initial, integral(pred_corners, self.project), self.reg_scale)

            if i == self.eval_idx:
                scores = self.lqe_layers[i](score_head[i](output), pred_corners)
                dec_bboxes.append(inter_ref_bbox)
                dec_logits.append(scores)
                break

            # Carry state into the next layer.
            pred_corners_undetach = pred_corners
            ref_pts = inter_ref_bbox.detach()
            output_detach = output.detach()

        return torch.stack(dec_bboxes), torch.stack(dec_logits)
|
||||
|
||||
|
||||
class DFINETransformer(nn.Module):
    """D-FINE decoder head: picks the top-scoring encoder tokens as queries and
    refines them with ``TransformerDecoder`` (inference path).

    Args:
        num_classes: number of class logits per query.
        hidden_dim: token width used throughout the decoder.
        num_queries: number of encoder tokens kept as decoder queries.
        feat_channels / feat_strides: channel count and stride of each input level.
        num_levels: number of levels fed to the decoder; extra levels beyond
            ``len(feat_channels)`` are produced by stride-2 convolutions.
        num_points: sampling points per level (int, list or tuple).
        eval_idx: decoder layer whose output is returned (negative counts from the end).
        eps: margin used to mark anchors too close to 0 or 1 as invalid.
        reg_max / reg_scale: FDR distribution settings.
        eval_spatial_size: ``(height, width)`` used to pre-compute anchors; falsy
            disables the cache.

    Anchors are cached for ``eval_spatial_size`` only; for any other input size
    (or with the cache disabled) they are generated from the actual level shapes.
    """

    def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, feat_channels=[256, 256, 256], feat_strides=[8, 16, 32],
                 num_levels=3, num_points=[3, 6, 3], nhead=8, num_layers=6, dim_feedforward=1024, eval_idx=-1, eps=1e-2, reg_max=32,
                 reg_scale=8.0, eval_spatial_size=(640, 640), device=None, dtype=None, operations=None):
        super().__init__()
        assert len(feat_strides) == len(feat_channels)
        self.hidden_dim = hidden_dim
        self.num_queries = num_queries
        self.num_levels = num_levels
        self.eps = eps
        self.eval_spatial_size = eval_spatial_size

        # Copy so the (shared) default list is never mutated; extra levels double the stride.
        self.feat_strides = list(feat_strides)
        for i in range(num_levels - len(feat_strides)):
            self.feat_strides.append(feat_strides[-1] * 2 ** (i + 1))

        # input projection (expects pre-fused weights)
        self.input_proj = nn.ModuleList()
        for ch in feat_channels:
            if ch == hidden_dim:
                self.input_proj.append(nn.Identity())
            else:
                self.input_proj.append(nn.Sequential(OrderedDict([
                    ('conv', operations.Conv2d(ch, hidden_dim, 1, bias=True, device=device, dtype=dtype))])))
        in_ch = feat_channels[-1]
        for _ in range(num_levels - len(feat_channels)):
            self.input_proj.append(nn.Sequential(OrderedDict([
                ('conv', operations.Conv2d(in_ch, hidden_dim, 3, 2, 1, bias=True, device=device, dtype=dtype))])))
            in_ch = hidden_dim

        # FDR parameters (non-trainable placeholders, set from config)
        self.up = nn.Parameter(torch.tensor([0.5]), requires_grad=False)
        self.reg_scale = nn.Parameter(torch.tensor([reg_scale]), requires_grad=False)

        # Always hand a list down: the attention module splits tensors by this value.
        if isinstance(num_points, (list, tuple)):
            pts = list(num_points)
        else:
            pts = [num_points] * num_levels
        self.decoder = TransformerDecoder(hidden_dim, nhead, dim_feedforward, num_levels, pts,
                                          num_layers, reg_max, self.reg_scale, self.up, eval_idx, device=device, dtype=dtype, operations=operations)

        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, 2, device=device, dtype=dtype, operations=operations)
        self.enc_output = nn.Sequential(OrderedDict([
            ('proj', operations.Linear(hidden_dim, hidden_dim, device=device, dtype=dtype)),
            ('norm', operations.LayerNorm(hidden_dim, device=device, dtype=dtype))]))
        self.enc_score_head = operations.Linear(hidden_dim, num_classes, device=device, dtype=dtype)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, device=device, dtype=dtype, operations=operations)

        self.eval_idx_ = eval_idx if eval_idx >= 0 else num_layers + eval_idx
        self.dec_score_head = nn.ModuleList(
            [operations.Linear(hidden_dim, num_classes, device=device, dtype=dtype) for _ in range(self.eval_idx_ + 1)])
        self.pre_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, device=device, dtype=dtype, operations=operations)
        self.dec_bbox_head = nn.ModuleList(
            [MLP(hidden_dim, hidden_dim, 4 * (reg_max + 1), 3, device=device, dtype=dtype, operations=operations)
             for _ in range(self.eval_idx_ + 1)])
        self.integral = Integral(reg_max)

        if eval_spatial_size:
            # Register as buffers so checkpoint values override the freshly-computed defaults
            anchors, valid_mask = self._gen_anchors()
            self.register_buffer('anchors', anchors)
            self.register_buffer('valid_mask', valid_mask)

    def _eval_shapes(self):
        """Per-level ``[h, w]`` implied by ``eval_spatial_size`` and ``feat_strides``."""
        h0, w0 = self.eval_spatial_size
        return [[int(h0 / s), int(w0 / s)] for s in self.feat_strides]

    def _gen_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=torch.float32, device='cpu'):
        """Build anchors in logit space for every cell of every level.

        Returns ``(anchors [1, N, 4], valid_mask [1, N, 1])``. Anchors with any
        coordinate outside ``(eps, 1 - eps)`` are marked invalid and set to ``+inf``.
        """
        if spatial_shapes is None:
            spatial_shapes = self._eval_shapes()
        anchors = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            gy, gx = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij')
            gxy = (torch.stack([gx, gy], -1).float() + 0.5) / torch.tensor([w, h], dtype=dtype)
            wh = torch.ones_like(gxy) * grid_size * (2. ** lvl)
            anchors.append(torch.cat([gxy, wh], -1).reshape(-1, h * w, 4))
        anchors = torch.cat(anchors, 1).to(device)
        valid_mask = ((anchors > self.eps) & (anchors < 1 - self.eps)).all(-1, keepdim=True)
        anchors = torch.log(anchors / (1 - anchors))
        anchors = torch.where(valid_mask, anchors, torch.full_like(anchors, float('inf')))
        return anchors, valid_mask

    def _encoder_input(self, feats: List[torch.Tensor]):
        """Project every level, flatten to tokens and return ``(memory, [[h, w], ...])``."""
        proj = [self.input_proj[i](f) for i, f in enumerate(feats)]
        for i in range(len(feats), self.num_levels):
            proj.append(self.input_proj[i](feats[-1] if i == len(feats) else proj[-1]))
        flat, shapes = [], []
        for f in proj:
            _, _, h, w = f.shape
            flat.append(f.flatten(2).permute(0, 2, 1))
            shapes.append([h, w])
        return torch.cat(flat, 1), shapes

    def _decoder_input(self, memory: torch.Tensor, spatial_shapes=None):
        """Select the top-scoring tokens and return ``(content, reference logits)``.

        ``spatial_shapes`` is optional for backward compatibility. When given, the
        cached anchors are used only if the shapes match ``eval_spatial_size``;
        otherwise anchors are generated for the actual shapes.

        Raises:
            ValueError: no cached anchors exist and ``spatial_shapes`` was not given.
        """
        anchors = getattr(self, 'anchors', None)
        valid_mask = getattr(self, 'valid_mask', None)
        use_cache = anchors is not None and valid_mask is not None
        if use_cache and spatial_shapes is not None:
            actual = [[int(h), int(w)] for h, w in spatial_shapes]
            use_cache = bool(self.eval_spatial_size) and actual == self._eval_shapes()
        if not use_cache:
            if spatial_shapes is None:
                raise ValueError("spatial_shapes is required when no anchors are cached")
            anchors, valid_mask = self._gen_anchors(spatial_shapes, device=memory.device)

        anchors = anchors.to(memory)
        if memory.shape[0] > 1:
            anchors = anchors.repeat(memory.shape[0], 1, 1)

        mem = valid_mask.to(memory) * memory
        out_mem = self.enc_output(mem)
        logits = self.enc_score_head(out_mem)
        # Never ask topk for more tokens than exist (small inputs).
        k = min(self.num_queries, logits.shape[1])
        _, idx = torch.topk(logits.max(-1).values, k, dim=-1)
        idx_e = idx.unsqueeze(-1)
        topk_mem = out_mem.gather(1, idx_e.expand(-1, -1, out_mem.shape[-1]))
        topk_anc = anchors.gather(1, idx_e.expand(-1, -1, anchors.shape[-1]))
        topk_ref = self.enc_bbox_head(topk_mem) + topk_anc
        return topk_mem.detach(), topk_ref.detach()

    def forward(self, feats: List[torch.Tensor]):
        """Return ``{'pred_logits', 'pred_boxes'}`` taken from the evaluated decoder layer."""
        memory, shapes = self._encoder_input(feats)
        content, ref = self._decoder_input(memory, shapes)
        out_bboxes, out_logits = self.decoder(
            content, ref, memory, shapes,
            self.dec_bbox_head, self.dec_score_head,
            self.query_pos_head, self.pre_bbox_head, self.integral)
        return {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main model
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class RTv4(nn.Module):
    """Detector assembled from an HGNetv2 backbone, a HybridEncoder and a DFINETransformer.

    ``forward`` returns one dict per image with ``labels``, ``boxes`` (xyxy,
    scaled by ``orig_size``) and ``scores``.
    """

    def __init__(self, num_classes=80, num_queries=300, enc_h=256, dec_h=256, enc_ff=2048, dec_ff=1024, feat_strides=[8, 16, 32], device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.device = device
        self.dtype = dtype
        self.operations = operations
        factory = dict(device=device, dtype=dtype, operations=operations)

        self.backbone = HGNetv2(**factory)
        self.encoder = HybridEncoder(hidden_dim=enc_h, dim_feedforward=enc_ff, **factory)
        self.decoder = DFINETransformer(
            num_classes=num_classes, hidden_dim=dec_h, num_queries=num_queries,
            feat_channels=[enc_h] * len(feat_strides), feat_strides=feat_strides,
            dim_feedforward=dec_ff, **factory)

        self.num_classes = num_classes
        self.num_queries = num_queries
        self.load_device = comfy.model_management.get_torch_device()

    def _forward(self, x: torch.Tensor):
        """Run backbone -> encoder -> decoder and return the raw decoder output dict."""
        multi_scale = self.backbone(x)
        encoded = self.encoder(multi_scale)
        return self.decoder(encoded)

    def postprocess(self, outputs, orig_size: tuple = (640, 640)) -> List[dict]:
        """Turn raw logits/boxes into the ``num_queries`` best (label, box, score) triples per image.

        ``orig_size[0]`` scales x coordinates and ``orig_size[1]`` scales y coordinates.
        """
        logits = outputs['pred_logits']
        xyxy = torchvision.ops.box_convert(outputs['pred_boxes'], 'cxcywh', 'xyxy')
        scale = torch.tensor(orig_size, device=xyxy.device, dtype=xyxy.dtype).repeat(1, 2).unsqueeze(1)
        xyxy = xyxy * scale

        probs = F.sigmoid(logits)
        # Rank every (query, class) pair jointly, then split the flat index back apart.
        top_scores, flat_idx = torch.topk(probs.flatten(1), self.num_queries, dim=-1)
        labels = flat_idx % self.num_classes
        query_idx = (flat_idx // self.num_classes).unsqueeze(-1).expand(-1, -1, 4)
        picked = xyxy.gather(1, query_idx)
        return [{'labels': lbl, 'boxes': b, 'scores': s} for lbl, b, s in zip(labels, picked, top_scores)]

    def forward(self, x: torch.Tensor, orig_size: tuple = (640, 640), **kwargs):
        moved = x.to(device=self.load_device, dtype=self.dtype)
        outputs = self._forward(moved)
        return self.postprocess(outputs, orig_size)
|
||||
|
|
@ -0,0 +1,596 @@
|
|||
# SAM3 detector: transformer encoder-decoder, segmentation head, geometry encoder, scoring.
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torchvision.ops import roi_align
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.sam3.tracker import SAM3Tracker, SAM31Tracker
|
||||
from comfy.ldm.sam3.sam import SAM3VisionBackbone # noqa: used in __init__
|
||||
from comfy.ldm.sam3.sam import MLP, PositionEmbeddingSine
|
||||
|
||||
# Maps a tracker variant name to the tracker class that implements it.
TRACKER_CLASSES = {"SAM3": SAM3Tracker, "SAM31": SAM31Tracker}
|
||||
from comfy.ops import cast_to_input
|
||||
|
||||
|
||||
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2) along the last axis."""
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return torch.stack(corners, dim=-1)
|
||||
|
||||
|
||||
def gen_sineembed_for_position(pos_tensor, num_feats=256):
    """Per-coordinate sinusoidal embedding: (..., N) -> (..., N * num_feats // 2).

    Each coordinate is multiplied by 2*pi, divided by ``num_feats // 2``
    frequencies, and encoded as interleaved sin/cos values. The result is cast
    back to the dtype of ``pos_tensor``.
    """
    assert num_feats % 2 == 0
    hdim = num_feats // 2
    exponents = 2 * (torch.arange(hdim, dtype=torch.float32, device=pos_tensor.device) // 2) / hdim
    freqs = 10000.0 ** exponents

    def embed(coord):
        raw = (coord.float() * 2 * math.pi).unsqueeze(-1) / freqs
        # Even slots carry sin, odd slots carry cos; stack + flatten interleaves them.
        pairs = torch.stack([raw[..., 0::2].sin(), raw[..., 1::2].cos()], dim=-1)
        return pairs.flatten(-2)

    embeds = [embed(coord) for coord in pos_tensor.unbind(-1)]
    return torch.cat(embeds, dim=-1).to(pos_tensor.dtype)
|
||||
|
||||
|
||||
class SplitMHA(nn.Module):
    """Multi-head attention with separate Q/K/V projections (split from fused in_proj_weight)."""

    def __init__(self, d_model, num_heads=8, device=None, dtype=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        # Registration order (q, k, v, out) is kept so parameters are created in the same sequence.
        for name in ('q_proj', 'k_proj', 'v_proj', 'out_proj'):
            setattr(self, name, operations.Linear(d_model, d_model, device=device, dtype=dtype))

    def forward(self, q_input, k_input=None, v_input=None, mask=None):
        """Attend from ``q_input`` to ``k_input``/``v_input``.

        Without ``k_input`` this is self-attention and ``v_input`` is ignored;
        without ``v_input`` the values are projected from ``k_input``.
        """
        if k_input is None:
            key_src = value_src = q_input
        else:
            key_src = k_input
            value_src = k_input if v_input is None else v_input

        q = self.q_proj(q_input)
        k = self.k_proj(key_src)
        v = self.v_proj(value_src)

        if mask is not None and mask.ndim == 2:
            mask = mask[:, None, None, :]  # [B, T] -> [B, 1, 1, T] for SDPA broadcast

        target_dtype = q.dtype  # manual_cast may produce mixed dtypes
        attended = optimized_attention(q, k.to(target_dtype), v.to(target_dtype), self.num_heads, mask=mask, low_precision_attention=False)
        return self.out_proj(attended)
|
||||
|
||||
|
||||
class MLPWithNorm(nn.Module):
    """ReLU MLP followed by a LayerNorm, with an optional skip connection.

    The skip connection is only active when ``residual`` is requested and the
    input and output widths are equal.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, residual=True, device=None, dtype=None, operations=None):
        super().__init__()
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        linears = []
        for idx in range(num_layers):
            linears.append(operations.Linear(widths[idx], widths[idx + 1], device=device, dtype=dtype))
        self.layers = nn.ModuleList(linears)
        self.out_norm = operations.LayerNorm(output_dim, device=device, dtype=dtype)
        self.residual = residual and (input_dim == output_dim)

    def forward(self, x):
        """Apply the linear stack (ReLU between layers), the skip, then the norm."""
        last_idx = len(self.layers) - 1
        hidden = x
        for idx, linear in enumerate(self.layers):
            hidden = linear(hidden)
            if idx != last_idx:
                # No activation after the final projection.
                hidden = F.relu(hidden)
        if self.residual:
            hidden = hidden + x
        return self.out_norm(hidden)
|
||||
|
||||
|
||||
class EncoderLayer(nn.Module):
    """Pre-norm transformer layer: self-attention, optional cross-attention, FFN.

    Every sub-block normalises its input first and adds its output back onto
    the un-normalised stream.
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.self_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.cross_attn_image = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
        self.linear2 = operations.Linear(dim_ff, d_model, **factory)
        self.norm1 = operations.LayerNorm(d_model, **factory)
        self.norm2 = operations.LayerNorm(d_model, **factory)
        self.norm3 = operations.LayerNorm(d_model, **factory)

    def forward(self, x, pos, text_memory=None, text_mask=None):
        """Run one layer.

        Args:
            x: token stream to update.
            pos: positional term added to the self-attention queries and keys
                (not to the values).
            text_memory: optional memory for the cross-attention step; the
                step is skipped entirely when this is None.
            text_mask: mask forwarded to the cross-attention step.
        """
        # Self-attention: positions go into Q/K only.
        h = self.norm1(x)
        h_pos = h + pos
        x = x + self.self_attn(h_pos, h_pos, h)

        # Cross-attention onto the supplied memory.
        if text_memory is not None:
            attended = self.cross_attn_image(self.norm2(x), text_memory, text_memory, mask=text_mask)
            x = x + attended

        # Feed-forward.
        ff_out = self.linear2(F.relu(self.linear1(self.norm3(x))))
        return x + ff_out
|
||||
|
||||
|
||||
class TransformerEncoder(nn.Module):
    """Stack of EncoderLayer modules. Checkpoint: transformer.encoder.layers.N.*"""

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, num_layers=6, device=None, dtype=None, operations=None):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, num_heads, dim_ff, device=device, dtype=dtype, operations=operations))
        self.layers = nn.ModuleList(stack)

    def forward(self, x, pos, text_memory=None, text_mask=None):
        """Feed ``x`` through every layer in order, reusing the same pos/memory/mask."""
        out = x
        for block in self.layers:
            out = block(out, pos, text_memory, text_mask)
        return out
|
||||
|
||||
|
||||
class DecoderLayer(nn.Module):
    """Post-norm decoder layer: self-attn, optional text cross-attn, memory cross-attn, FFN.

    Note the norm naming: ``norm2`` follows self-attention, ``catext_norm``
    follows the text cross-attention, ``norm1`` follows the memory
    cross-attention and ``norm3`` follows the feed-forward block.
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.self_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.cross_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.ca_text = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.norm1 = operations.LayerNorm(d_model, **factory)
        self.norm2 = operations.LayerNorm(d_model, **factory)
        self.norm3 = operations.LayerNorm(d_model, **factory)
        self.catext_norm = operations.LayerNorm(d_model, **factory)
        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
        self.linear2 = operations.Linear(dim_ff, d_model, **factory)

    def forward(self, x, memory, x_pos, memory_pos, text_memory=None, text_mask=None, cross_attn_bias=None):
        """Run one decoder layer.

        Args:
            x: query stream.
            memory: memory attended to by ``cross_attn``.
            x_pos: positional term added to the queries of every attention step.
            memory_pos: positional term added to the memory keys (not values).
            text_memory: optional text memory; its cross-attention is skipped when None.
            text_mask: mask for the text cross-attention.
            cross_attn_bias: passed as ``mask`` to the memory cross-attention.
        """
        # Self-attention among queries.
        sa_in = x + x_pos
        x = self.norm2(x + self.self_attn(sa_in, sa_in, x))

        # Cross-attention onto the text memory.
        if text_memory is not None:
            text_ctx = self.ca_text(x + x_pos, text_memory, text_memory, mask=text_mask)
            x = self.catext_norm(x + text_ctx)

        # Cross-attention onto the memory.
        mem_ctx = self.cross_attn(x + x_pos, memory + memory_pos, memory, mask=cross_attn_bias)
        x = self.norm1(x + mem_ctx)

        # Feed-forward.
        ff_out = self.linear2(F.relu(self.linear1(x)))
        return self.norm3(x + ff_out)
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Module):
    """Query decoder with iteratively refined reference boxes and a presence token.

    A single learned presence token is prepended to the object queries for
    every layer and split off again afterwards.
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, num_layers=6,
                 num_queries=200, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.d_model = d_model
        self.num_queries = num_queries

        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, dim_ff, operations=operations, **factory)
            for _ in range(num_layers)
        ])
        self.norm = operations.LayerNorm(d_model, **factory)
        self.query_embed = operations.Embedding(num_queries, d_model, **factory)
        # Learned anchor boxes, one 4-vector per query.
        self.reference_points = operations.Embedding(num_queries, 4, **factory)
        # Input is 512 wide for d_model=256: 4 coords * 128 sine features each.
        self.ref_point_head = MLP(d_model * 2, d_model, d_model, 2, operations=operations, **factory)
        self.bbox_embed = MLP(d_model, d_model, 4, 3, operations=operations, **factory)

        # Per-axis MLPs producing one bias value per attention head.
        self.boxRPB_embed_x = MLP(2, d_model, num_heads, 2, operations=operations, **factory)
        self.boxRPB_embed_y = MLP(2, d_model, num_heads, 2, operations=operations, **factory)

        self.presence_token = operations.Embedding(1, d_model, **factory)
        self.presence_token_head = MLP(d_model, d_model, 1, 3, operations=operations, **factory)
        self.presence_token_out_norm = operations.LayerNorm(d_model, **factory)

    @staticmethod
    def _inverse_sigmoid(x):
        """Approximate logit of ``x`` with 1e-6 offsets guarding the division and the log."""
        denom = 1 - x + 1e-6
        return torch.log(x / denom + 1e-6)

    def _compute_box_rpb(self, ref_points, H, W):
        """Box relative position bias: (B, Q, 4) cxcywh -> (B, n_heads, Q+1, H*W) bias.

        The extra row at index 0 of the query axis is an all-zero bias for the
        presence token.
        """
        boxes_xyxy = box_cxcywh_to_xyxy(ref_points)
        B, _, _ = boxes_xyxy.shape
        dev = ref_points.device
        grid_y = torch.arange(H, device=dev, dtype=torch.float32) / H
        grid_x = torch.arange(W, device=dev, dtype=torch.float32) / W
        # Offsets of every grid column / row to the box's (x0, x1) / (y0, y1) edges.
        x_edges = boxes_xyxy[:, :, None, 0:3:2]
        y_edges = boxes_xyxy[:, :, None, 1:4:2]
        deltas_x = grid_x.view(1, 1, -1, 1) - x_edges
        deltas_y = grid_y.view(1, 1, -1, 1) - y_edges

        log2_8 = float(math.log2(8))

        def log_scale(d):
            # Signed log compression of the offsets.
            scaled = d * 8
            return torch.sign(scaled) * torch.log2(torch.abs(scaled) + 1.0) / log2_8

        rpb_x = self.boxRPB_embed_x(log_scale(deltas_x).to(ref_points.dtype))
        rpb_y = self.boxRPB_embed_y(log_scale(deltas_y).to(ref_points.dtype))

        # Broadcast-sum the row and column terms, then move heads to dim 1.
        summed = rpb_y.unsqueeze(3) + rpb_x.unsqueeze(2)
        bias = summed.flatten(2, 3).permute(0, 3, 1, 2)
        pres_bias = torch.zeros(B, bias.shape[1], 1, bias.shape[3], device=bias.device, dtype=bias.dtype)
        return torch.cat([pres_bias, bias], dim=2)

    def forward(self, memory, memory_pos, text_memory=None, text_mask=None, H=72, W=72):
        """Decode object queries against ``memory``.

        Returns:
            dict with ``decoder_output`` (normalised queries), ``pred_boxes``
            (sigmoid of the refined reference plus the box head output) and
            ``presence`` (presence head output with the last dim squeezed).
        """
        B = memory.shape[0]

        def tiled(weight):
            # Cast the learned table like `memory` and repeat it over the batch.
            return cast_to_input(weight, memory).unsqueeze(0).expand(B, -1, -1)

        tgt = tiled(self.query_embed.weight)
        presence_out = tiled(self.presence_token.weight)
        ref_points = tiled(self.reference_points.weight).sigmoid()

        final_idx = len(self.layers) - 1
        for layer_idx, layer in enumerate(self.layers):
            query_pos = self.ref_point_head(gen_sineembed_for_position(ref_points, self.d_model))
            # The presence token rides along at position 0 with a zero positional term.
            stream = torch.cat([presence_out, tgt], dim=1)
            stream_pos = torch.cat([torch.zeros_like(presence_out), query_pos], dim=1)
            rpb = self._compute_box_rpb(ref_points, H, W)
            stream = layer(stream, memory, stream_pos, memory_pos, text_memory, text_mask, rpb)
            presence_out = stream[:, :1]
            tgt = stream[:, 1:]
            if layer_idx < final_idx:
                # Refine the reference boxes for the next layer; detached from the graph.
                delta = self.bbox_embed(self.norm(tgt))
                ref_points = (self._inverse_sigmoid(ref_points) + delta).sigmoid().detach()

        query_out = self.norm(tgt)
        boxes = (self._inverse_sigmoid(ref_points) + self.bbox_embed(query_out)).sigmoid()
        presence = self.presence_token_head(self.presence_token_out_norm(presence_out)).squeeze(-1)
        return {"decoder_output": query_out, "pred_boxes": boxes, "presence": presence}
|
||||
|
||||
|
||||
class Transformer(nn.Module):
    """Container pairing a TransformerEncoder with a TransformerDecoder.

    It defines no ``forward``; it only holds the two sub-modules as
    ``encoder`` and ``decoder``.
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, enc_layers=6, dec_layers=6,
                 num_queries=200, device=None, dtype=None, operations=None):
        super().__init__()
        shared = {"device": device, "dtype": dtype, "operations": operations}
        self.encoder = TransformerEncoder(d_model, num_heads, dim_ff, enc_layers, **shared)
        self.decoder = TransformerDecoder(d_model, num_heads, dim_ff, dec_layers, num_queries, **shared)
|
||||
|
||||
|
||||
class GeometryEncoder(nn.Module):
    """Encode point and box prompts into prompt tokens using backbone features.

    Each prompt embedding is the sum of a direct projection of its coordinates,
    a projection of image features pooled at its location, a projection of a
    sinusoidal position encoding, and a label embedding.
    """

    def __init__(self, d_model=256, num_heads=8, num_layers=3, roi_size=7, device=None, dtype=None, operations=None):
        super().__init__()
        self.d_model = d_model
        self.roi_size = roi_size
        self.pos_enc = PositionEmbeddingSine(num_pos_feats=d_model, normalize=True)
        self.points_direct_project = operations.Linear(2, d_model, device=device, dtype=dtype)
        self.points_pool_project = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.points_pos_enc_project = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.boxes_direct_project = operations.Linear(4, d_model, device=device, dtype=dtype)
        # Kernel equals the ROI size, so each pooled ROI collapses to a 1x1 map.
        self.boxes_pool_project = operations.Conv2d(d_model, d_model, kernel_size=roi_size, device=device, dtype=dtype)
        self.boxes_pos_enc_project = operations.Linear(d_model + 2, d_model, device=device, dtype=dtype)
        self.label_embed = operations.Embedding(2, d_model, device=device, dtype=dtype)
        self.cls_embed = operations.Embedding(1, d_model, device=device, dtype=dtype)
        self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.img_pre_norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.encode = nn.ModuleList([
            EncoderLayer(d_model, num_heads, 2048, device=device, dtype=dtype, operations=operations)
            for _ in range(num_layers)
        ])
        self.encode_norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.final_proj = operations.Linear(d_model, d_model, device=device, dtype=dtype)

    def _encode_points(self, coords, labels, img_feat_2d):
        """Encode point prompts: direct + pool + pos_enc + label. coords: [B, N, 2] normalized."""
        B, N, _ = coords.shape
        embed = self.points_direct_project(coords)
        # Pool features from backbone at point locations via grid_sample
        grid = (coords * 2 - 1).unsqueeze(2)  # [B, N, 1, 2] in [-1, 1]
        sampled = F.grid_sample(img_feat_2d, grid, align_corners=False)  # [B, C, N, 1]
        embed = embed + self.points_pool_project(sampled.squeeze(-1).permute(0, 2, 1))  # [B, N, C]
        # Positional encoding of coordinates
        x, y = coords[:, :, 0], coords[:, :, 1]  # [B, N]
        pos_x, pos_y = self.pos_enc._encode_xy(x.flatten(), y.flatten())
        enc = torch.cat([pos_x, pos_y], dim=-1).view(B, N, -1)
        embed = embed + self.points_pos_enc_project(cast_to_input(enc, embed))
        embed = embed + cast_to_input(self.label_embed(labels.long()), embed)
        return embed

    def _encode_boxes(self, boxes, labels, img_feat_2d):
        """Encode box prompts: direct + pool + pos_enc + label. boxes: [B, N, 4] normalized cxcywh."""
        B, N, _ = boxes.shape
        embed = self.boxes_direct_project(boxes)
        # ROI align from backbone at box regions; boxes are scaled from
        # normalized coordinates to feature-map pixels first.
        H, W = img_feat_2d.shape[-2:]
        boxes_xyxy = box_cxcywh_to_xyxy(boxes)
        scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype, device=boxes_xyxy.device)
        boxes_scaled = boxes_xyxy * scale
        # split(N) yields one [N, 4] tensor per batch element.
        sampled = roi_align(img_feat_2d, boxes_scaled.view(-1, 4).split(N), self.roi_size)
        proj = self.boxes_pool_project(sampled).view(B, N, -1)  # Conv2d(roi_size) -> [B*N, C, 1, 1] -> [B, N, C]
        embed = embed + proj
        # Positional encoding of box center + size
        cx, cy, w, h = boxes[:, :, 0], boxes[:, :, 1], boxes[:, :, 2], boxes[:, :, 3]
        enc = self.pos_enc.encode_boxes(cx.flatten(), cy.flatten(), w.flatten(), h.flatten())
        enc = enc.view(B, N, -1)
        embed = embed + self.boxes_pos_enc_project(cast_to_input(enc, embed))
        embed = embed + cast_to_input(self.label_embed(labels.long()), embed)
        return embed

    def forward(self, points=None, boxes=None, image_features=None):
        """Encode geometry prompts. image_features: [B, HW, C] flattened backbone features.

        Args:
            points: optional ``(coords, labels)`` pair for point prompts.
            boxes: optional box prompts; every box receives label 1.
            image_features: flattened features of a square feature map.

        Returns:
            The projected prompt tokens (points first, then boxes), or None
            when neither points nor boxes are given.

        Raises:
            ValueError: if prompts are given without ``image_features`` or the
                flattened feature length is not a perfect square.
        """
        # Nothing to encode: skip the image pre-norm/reshape entirely so that
        # prompt-free calls neither pay for it nor depend on a square map.
        if points is None and boxes is None:
            return None
        if image_features is None:
            raise ValueError("GeometryEncoder requires image_features to encode point or box prompts")

        # Prepare 2D image features for pooling
        B = image_features.shape[0]
        HW, C = image_features.shape[1], image_features.shape[2]
        hw = int(math.sqrt(HW))
        if hw * hw != HW:
            raise ValueError(f"GeometryEncoder expects a square feature map, got {HW} flattened positions")
        img_normed = self.img_pre_norm(image_features)
        img_feat_2d = img_normed.permute(0, 2, 1).view(B, C, hw, hw)

        embeddings = []
        if points is not None:
            coords, labels = points
            embeddings.append(self._encode_points(coords, labels, img_feat_2d))
        if boxes is not None:
            box_labels = torch.ones(boxes.shape[0], boxes.shape[1], dtype=torch.long, device=boxes.device)
            embeddings.append(self._encode_boxes(boxes, box_labels, img_feat_2d))

        geo = self.norm(torch.cat(embeddings, dim=1))
        # Refine the prompt tokens against the (un-normalised) image features,
        # with a zero positional term for the prompt tokens.
        for layer in self.encode:
            geo = layer(geo, torch.zeros_like(geo), image_features)
        geo = self.encode_norm(geo)
        return self.final_proj(geo)
|
||||
|
||||
|
||||
class PixelDecoder(nn.Module):
    """Top-down FPN pixel decoder with GroupNorm + ReLU + nearest interpolation."""

    def __init__(self, d_model=256, num_stages=3, device=None, dtype=None, operations=None):
        super().__init__()
        convs = []
        for _ in range(num_stages):
            convs.append(operations.Conv2d(d_model, d_model, kernel_size=3, padding=1, device=device, dtype=dtype))
        self.conv_layers = nn.ModuleList(convs)
        norms = []
        for _ in range(num_stages):
            norms.append(operations.GroupNorm(8, d_model, device=device, dtype=dtype))
        self.norms = nn.ModuleList(norms)

    def forward(self, backbone_features):
        """Fuse feature levels starting from the last one and walking back to the first.

        At every step the running map is resized (nearest) to the next level's
        spatial size, added to it, and passed through conv -> GroupNorm -> ReLU.
        With a single level the input map is returned unchanged.
        """
        top_down = backbone_features[-1]
        for stage, lateral in enumerate(reversed(backbone_features[:-1])):
            upsampled = F.interpolate(top_down, size=lateral.shape[-2:], mode="nearest")
            fused = self.conv_layers[stage](lateral + upsampled)
            top_down = F.relu(self.norms[stage](fused))
        return top_down
|
||||
|
||||
|
||||
class MaskPredictor(nn.Module):
    """Turn query embeddings into per-query masks via a dot product with pixel features."""

    def __init__(self, d_model=256, device=None, dtype=None, operations=None):
        super().__init__()
        self.mask_embed = MLP(d_model, d_model, d_model, 3, device=device, dtype=dtype, operations=operations)

    def forward(self, query_embeddings, pixel_features):
        """Return [B, Q, H, W] logits from [B, Q, C] queries and [B, C, H, W] pixel features."""
        projected = self.mask_embed(query_embeddings)
        # Contract the channel axis between every query and every pixel.
        masks = torch.einsum("bqc,bchw->bqhw", projected, pixel_features)
        return masks
|
||||
|
||||
|
||||
class SegmentationHead(nn.Module):
    """Predict instance masks from decoder queries and multi-level backbone features.

    ``semantic_seg_head`` is constructed here but not used by ``forward``.
    """

    def __init__(self, d_model=256, num_heads=8, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.d_model = d_model
        self.pixel_decoder = PixelDecoder(d_model, 3, operations=operations, **factory)
        self.mask_predictor = MaskPredictor(d_model, operations=operations, **factory)
        self.cross_attend_prompt = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.cross_attn_norm = operations.LayerNorm(d_model, **factory)
        self.instance_seg_head = operations.Conv2d(d_model, d_model, kernel_size=1, **factory)
        self.semantic_seg_head = operations.Conv2d(d_model, 1, kernel_size=1, **factory)

    def forward(self, query_embeddings, backbone_features, encoder_hidden_states=None, prompt=None, prompt_mask=None):
        """Compute mask logits for every query.

        Args:
            query_embeddings: decoder queries fed to the mask predictor.
            backbone_features: sequence of feature maps; the last entry is
                replaced by the encoder output when ``encoder_hidden_states``
                is given (the caller's sequence is not modified).
            encoder_hidden_states: optional flattened encoder output.
            prompt: optional prompt tokens the encoder output cross-attends to.
            prompt_mask: mask for that cross-attention.
        """
        if encoder_hidden_states is not None:
            if prompt is not None:
                # Pre-norm cross-attention onto the prompt with a residual.
                attended = self.cross_attend_prompt(
                    self.cross_attn_norm(encoder_hidden_states), prompt, prompt, mask=prompt_mask)
                encoder_hidden_states = attended + encoder_hidden_states

            # Fold the first H*W encoder tokens back into a 2-D map matching
            # the last backbone level and substitute it for that level.
            last_level = backbone_features[-1]
            B = encoder_hidden_states.shape[0]
            H, W = last_level.shape[-2], last_level.shape[-1]
            encoder_visual = encoder_hidden_states[:, :H * W].permute(0, 2, 1).view(B, self.d_model, H, W)
            backbone_features = list(backbone_features)
            backbone_features[-1] = encoder_visual

        pixel_features = self.pixel_decoder(backbone_features)
        instance_features = self.instance_seg_head(pixel_features)
        return self.mask_predictor(query_embeddings, instance_features)
|
||||
|
||||
|
||||
class DotProductScoring(nn.Module):
    """Score each query by a scaled dot product with a pooled prompt embedding."""

    def __init__(self, d_model=256, device=None, dtype=None, operations=None):
        super().__init__()
        self.hs_proj = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.prompt_proj = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.prompt_mlp = MLPWithNorm(d_model, 2048, d_model, 2, device=device, dtype=dtype, operations=operations)
        self.scale = 1.0 / (d_model ** 0.5)

    def forward(self, query_embeddings, prompt_embeddings, prompt_mask=None):
        """Return one score per query, clamped to [-12, 12].

        Args:
            query_embeddings: queries to score.
            prompt_embeddings: prompt tokens, pooled over dim 1.
            prompt_mask: optional per-token weights for the pooling; without
                it a plain mean over dim 1 is used.
        """
        prompt = self.prompt_mlp(prompt_embeddings)
        if prompt_mask is None:
            pooled = prompt.mean(dim=1)
        else:
            # Masked mean; the clamp avoids dividing by zero for empty masks.
            weight = prompt_mask.unsqueeze(-1).to(dtype=prompt.dtype)
            total = (prompt * weight).sum(dim=1)
            count = weight.sum(dim=1).clamp(min=1)
            pooled = total / count

        hs = self.hs_proj(query_embeddings)
        pp = self.prompt_proj(pooled).unsqueeze(-1).to(hs.dtype)
        scores = torch.matmul(hs, pp) * self.scale
        return scores.clamp(-12.0, 12.0).squeeze(-1)
|
||||
|
||||
|
||||
class SAM3Detector(nn.Module):
    """Prompted detector: vision backbone, prompt encoders, transformer, scoring and mask head."""

    def __init__(self, d_model=256, embed_dim=1024, num_queries=200, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        image_model = kwargs.pop("image_model", "SAM3")
        # These keys are dropped so they are not forwarded to the vision backbone.
        for unused_key in ("num_heads", "num_head_channels"):
            kwargs.pop(unused_key, None)
        multiplex = image_model == "SAM31"
        # SAM3: 4 FPN levels, drop last (scalp=1); SAM3.1: 3 levels, use all (scalp=0)
        self.scalp = 0 if multiplex else 1

        factory = {"device": device, "dtype": dtype, "operations": operations}
        vision_backbone = SAM3VisionBackbone(embed_dim=embed_dim, d_model=d_model, multiplex=multiplex, **factory, **kwargs)
        language_backbone = nn.ModuleDict({"resizer": operations.Linear(embed_dim, d_model, device=device, dtype=dtype)})
        self.backbone = nn.ModuleDict({
            "vision_backbone": vision_backbone,
            "language_backbone": language_backbone,
        })
        self.transformer = Transformer(d_model=d_model, num_queries=num_queries, **factory)
        self.segmentation_head = SegmentationHead(d_model=d_model, **factory)
        self.geometry_encoder = GeometryEncoder(d_model=d_model, **factory)
        self.dot_prod_scoring = DotProductScoring(d_model=d_model, **factory)

    def _get_backbone_features(self, images):
        """Run backbone and return (detector_features, detector_positions, tracker_features, tracker_positions)."""
        bb = self.backbone["vision_backbone"]
        mode_kwargs = {"tracker_mode": "propagation"} if bb.multiplex else {"need_tracker": True}
        all_f, all_p, tf, tp = bb(images, **mode_kwargs)
        return all_f, all_p, tf, tp

    @staticmethod
    def _run_geo_layer(layer, x, memory, memory_pos):
        """Apply one geometry-encoder layer to ``x`` attending to image memory.

        Unlike ``EncoderLayer.forward``, self-attention gets no positional
        term and the cross-attention keys are ``memory + memory_pos``.
        """
        x = x + layer.self_attn(layer.norm1(x))
        cross = layer.cross_attn_image(layer.norm2(x), memory + memory_pos, memory)
        x = x + cross
        ff_out = layer.linear2(F.relu(layer.linear1(layer.norm3(x))))
        return x + ff_out

    def _assemble_prompt(self, batch, img_flat, pos_flat, text_embeddings, text_mask, points, boxes):
        """Concatenate text tokens, geometry tokens and the geometry CLS token, plus a matching mask."""
        geo_enc = self.geometry_encoder
        geo_prompts = geo_enc(points=points, boxes=boxes, image_features=img_flat)

        # The CLS token is projected/normalised, then refined against the image.
        cls_token = cast_to_input(geo_enc.cls_embed.weight, img_flat).view(1, 1, -1).expand(batch, -1, -1)
        geo_cls = geo_enc.norm(geo_enc.final_proj(cls_token))
        for layer in geo_enc.encode:
            geo_cls = self._run_geo_layer(layer, geo_cls, img_flat, pos_flat)
        geo_cls = geo_enc.encode_norm(geo_cls)

        # Broadcast text inputs over the image batch when their batch size differs.
        if text_embeddings is not None and text_embeddings.shape[0] != batch:
            text_embeddings = text_embeddings.expand(batch, -1, -1)
        if text_mask is not None and text_mask.shape[0] != batch:
            text_mask = text_mask.expand(batch, -1)

        present = [t for t in (text_embeddings, geo_prompts, geo_cls) if t is not None]
        prompt = torch.cat(present, dim=1)
        total = prompt.shape[1]
        if text_mask is None:
            prompt_mask = torch.ones(batch, total, dtype=torch.bool, device=prompt.device)
        else:
            # Appended geometry tokens are always marked True.
            appended = torch.ones(batch, total - text_mask.shape[1], dtype=torch.bool, device=text_mask.device)
            prompt_mask = torch.cat([text_mask, appended], dim=1)
        return prompt, prompt_mask

    def _detect(self, features, positions, text_embeddings=None, text_mask=None,
                points=None, boxes=None):
        """Shared detection: geometry encoding, transformer, scoring, segmentation."""
        B = features[0].shape[0]
        # Scalp for encoder (use top-level feature), but keep all levels for segmentation head
        seg_features = features
        if self.scalp > 0:
            features = features[:-self.scalp]
            positions = positions[:-self.scalp]
        enc_feat = features[-1]
        enc_pos = positions[-1]
        _, _, H, W = enc_feat.shape
        img_flat = enc_feat.flatten(2).permute(0, 2, 1)
        pos_flat = enc_pos.flatten(2).permute(0, 2, 1)

        no_prompts = text_embeddings is None and points is None and boxes is None
        if not no_prompts:
            text_embeddings, text_mask = self._assemble_prompt(
                B, img_flat, pos_flat, text_embeddings, text_mask, points, boxes)

        memory = self.transformer.encoder(img_flat, pos_flat, text_embeddings, text_mask)
        dec_out = self.transformer.decoder(memory, pos_flat, text_embeddings, text_mask, H, W)
        query_out = dec_out["decoder_output"]
        pred_boxes = dec_out["pred_boxes"]

        if text_embeddings is None:
            # Without any prompt there is nothing to score against.
            scores = torch.zeros(B, query_out.shape[1], device=query_out.device)
        else:
            scores = self.dot_prod_scoring(query_out, text_embeddings, text_mask)

        masks = self.segmentation_head(query_out, seg_features, encoder_hidden_states=memory, prompt=text_embeddings, prompt_mask=text_mask)
        return box_cxcywh_to_xyxy(pred_boxes), scores, masks, dec_out

    def forward(self, images, text_embeddings=None, text_mask=None, points=None, boxes=None, threshold=0.3, orig_size=None):
        """Detect objects in ``images`` for the given prompts.

        ``threshold`` is accepted but not used here. When ``orig_size`` (a
        ``(height, width)`` pair) is given, boxes are scaled by it and masks
        are bilinearly resized to it.

        Returns:
            dict with ``boxes``, ``scores``, ``masks`` and ``presence``.
        """
        features, positions, _, _ = self._get_backbone_features(images)

        if text_embeddings is not None:
            resizer = self.backbone["language_backbone"]["resizer"]
            text_embeddings = resizer(text_embeddings)
            if text_mask is not None:
                text_mask = text_mask.bool()

        boxes_xyxy, scores, masks, dec_out = self._detect(
            features, positions, text_embeddings, text_mask, points, boxes)

        if orig_size is not None:
            oh, ow = orig_size
            size_scale = torch.tensor([ow, oh, ow, oh], device=boxes_xyxy.device, dtype=boxes_xyxy.dtype)
            boxes_xyxy = boxes_xyxy * size_scale
            masks = F.interpolate(masks, size=orig_size, mode="bilinear", align_corners=False)

        result = {}
        result["boxes"] = boxes_xyxy
        result["scores"] = scores
        result["masks"] = masks
        result["presence"] = dec_out.get("presence")
        return result

    def forward_from_trunk(self, trunk_out, text_embeddings, text_mask):
        """Run detection using a pre-computed ViTDet trunk output.

        text_embeddings must already be resized through language_backbone.resizer.
        Returns dict with boxes (normalized xyxy), scores, masks at detector resolution.
        """
        bb = self.backbone["vision_backbone"]
        features = []
        positions = []
        for conv in bb.convs:
            features.append(conv(trunk_out))
        for feat in features:
            positions.append(cast_to_input(bb.position_encoding(feat), feat))

        if text_mask is not None:
            text_mask = text_mask.bool()

        boxes_xyxy, scores, masks, _ = self._detect(features, positions, text_embeddings, text_mask)
        return {"boxes": boxes_xyxy, "scores": scores, "masks": masks}
|
||||
|
||||
|
||||
class SAM3Model(nn.Module):
    """Top-level model bundling a SAM3Detector with the tracker matching ``image_model``."""

    def __init__(self, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.dtype = dtype
        # Resolve the tracker class first; an unknown image_model raises KeyError.
        tracker_cls = TRACKER_CLASSES[kwargs.get("image_model", "SAM3")]
        shared = {"device": device, "dtype": dtype, "operations": operations}
        self.detector = SAM3Detector(**shared, **kwargs)
        self.tracker = tracker_cls(**shared, **kwargs)

    def forward(self, images, **kwargs):
        """Delegate to the detector."""
        return self.detector(images, **kwargs)

    def forward_segment(self, images, point_inputs=None, box_inputs=None, mask_inputs=None):
        """Interactive segmentation using SAM decoder with point/box/mask prompts.

        Args:
            images: [B, 3, 1008, 1008] preprocessed images
            point_inputs: {"point_coords": [B, N, 2], "point_labels": [B, N]} in 1008x1008 pixel space
            box_inputs: [B, 2, 2] box corners (top-left, bottom-right) in 1008x1008 pixel space
            mask_inputs: [B, 1, H, W] coarse mask logits to refine
        Returns:
            [B, 1, image_size, image_size] high-res mask logits
        """
        bb = self.detector.backbone["vision_backbone"]
        mode_kwargs = {"tracker_mode": "interactive"} if bb.multiplex else {"need_tracker": True}
        _, _, tracker_features, tracker_positions = bb(images, **mode_kwargs)

        scalp = self.detector.scalp
        if scalp > 0:
            tracker_features = tracker_features[:-scalp]
            tracker_positions = tracker_positions[:-scalp]

        # All but the last level serve as high-res features; the last is the main map.
        high_res = list(tracker_features[:-1])
        backbone_feat = tracker_features[-1]
        B, C, H, W = backbone_feat.shape

        # Add no-memory embedding (init frame path); prefer the interactive
        # variant when the tracker provides one.
        no_mem = getattr(self.tracker, 'interactivity_no_mem_embed', None)
        if no_mem is None:
            no_mem = getattr(self.tracker, 'no_mem_embed', None)
        if no_mem is not None:
            tokens = backbone_feat.flatten(2).permute(0, 2, 1)
            tokens = tokens + cast_to_input(no_mem, tokens)
            backbone_feat = tokens.view(B, H, W, C).permute(0, 3, 1, 2)

        if point_inputs is None:
            num_pts = 0
        else:
            num_pts = point_inputs["point_labels"].size(1)

        # Multi-mask output is requested only for exactly one point.
        _, high_res_masks, _, _ = self.tracker._forward_sam_heads(
            backbone_features=backbone_feat,
            point_inputs=point_inputs,
            mask_inputs=mask_inputs,
            box_inputs=box_inputs,
            high_res_features=high_res,
            multimask_output=(0 < num_pts <= 1),
        )
        return high_res_masks

    def forward_video(self, images, initial_masks, pbar=None, text_prompts=None,
                      new_det_thresh=0.5, max_objects=0, detect_interval=1):
        """Track video with optional per-frame text-prompted detection.

        ``text_prompts`` is an iterable of ``(embedding, mask)`` pairs; each
        embedding is passed through the language resizer once up front.

        Raises:
            ValueError: if the tracker has no ``track_video_with_detection``
                and ``initial_masks`` is None.
        """
        bb = self.detector.backbone["vision_backbone"]

        def backbone_fn(frame, frame_idx=None):
            # Compute the trunk once and reuse it for the tracker-only pass.
            trunk_out = bb.trunk(frame)
            mode_kwargs = {"tracker_mode": "propagation"} if bb.multiplex else {"need_tracker": True}
            _, _, tf, tp = bb(frame, cached_trunk=trunk_out, tracker_only=True, **mode_kwargs)
            return tf, tp, trunk_out

        detect_fn = None
        if text_prompts:
            resizer = self.detector.backbone["language_backbone"]["resizer"]
            resized = []
            for emb, m in text_prompts:
                resized.append((resizer(emb), m.bool() if m is not None else None))

            def detect_fn(trunk_out):
                # Run the detector once per prompt and concatenate along the query axis.
                all_scores, all_masks = [], []
                for emb, mask in resized:
                    det = self.detector.forward_from_trunk(trunk_out, emb, mask)
                    all_scores.append(det["scores"])
                    all_masks.append(det["masks"])
                return {"scores": torch.cat(all_scores, dim=1), "masks": torch.cat(all_masks, dim=1)}

        if hasattr(self.tracker, 'track_video_with_detection'):
            return self.tracker.track_video_with_detection(
                backbone_fn, images, initial_masks, detect_fn,
                new_det_thresh=new_det_thresh, max_objects=max_objects,
                detect_interval=detect_interval, backbone_obj=bb, pbar=pbar)

        # SAM3 (non-multiplex) — no detection support, requires initial masks
        if initial_masks is None:
            raise ValueError("SAM3 (non-multiplex) requires initial_mask for video tracking")
        return self.tracker.track_video(backbone_fn, images, initial_masks, pbar=pbar, backbone_obj=bb)
|
||||
|
|
@ -0,0 +1,425 @@
|
|||
# SAM3 shared components: primitives, ViTDet backbone, FPN neck, position encodings.
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.flux.math import apply_rope
|
||||
from comfy.ldm.flux.layers import EmbedND
|
||||
from comfy.ops import cast_to_input
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Stack of Linear layers with ReLU between them and an optional final sigmoid."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, sigmoid_output=False, device=None, dtype=None, operations=None):
        super().__init__()
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        linears = []
        for idx in range(num_layers):
            linears.append(operations.Linear(widths[idx], widths[idx + 1], device=device, dtype=dtype))
        self.layers = nn.ModuleList(linears)
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        """Apply every layer; ReLU follows all but the last one."""
        last_idx = len(self.layers) - 1
        for idx, linear in enumerate(self.layers):
            x = linear(x)
            if idx < last_idx:
                x = F.relu(x)
        if self.sigmoid_output:
            return torch.sigmoid(x)
        return x
|
||||
|
||||
|
||||
class SAMAttention(nn.Module):
    """Attention layer whose internal width is ``embedding_dim // downsample_rate``.

    Keys and values may come from a different input width via ``kv_in_dim``;
    the output is projected back to ``embedding_dim``.
    """

    def __init__(self, embedding_dim, num_heads, downsample_rate=1, kv_in_dim=None, device=None, dtype=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        internal_dim = embedding_dim // downsample_rate
        if kv_in_dim is None:
            kv_dim = embedding_dim
        else:
            kv_dim = kv_in_dim
        factory = {"device": device, "dtype": dtype}
        self.q_proj = operations.Linear(embedding_dim, internal_dim, **factory)
        self.k_proj = operations.Linear(kv_dim, internal_dim, **factory)
        self.v_proj = operations.Linear(kv_dim, internal_dim, **factory)
        self.out_proj = operations.Linear(internal_dim, embedding_dim, **factory)

    def forward(self, q, k, v):
        """Project q/k/v, attend, and project the result back to ``embedding_dim``."""
        q_proj = self.q_proj(q)
        k_proj = self.k_proj(k)
        v_proj = self.v_proj(v)
        attended = optimized_attention(q_proj, k_proj, v_proj, self.num_heads, low_precision_attention=False)
        return self.out_proj(attended)
|
||||
|
||||
|
||||
class TwoWayAttentionBlock(nn.Module):
    """Block updating both token queries and image keys.

    Order of operations: query self-attention, query-to-image cross-attention,
    MLP on the queries, then image-to-query cross-attention. Each step is
    followed by its own LayerNorm.
    """

    def __init__(self, embedding_dim, num_heads, mlp_dim=2048, attention_downsample_rate=2, skip_first_layer_pe=False, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.skip_first_layer_pe = skip_first_layer_pe
        self.self_attn = SAMAttention(embedding_dim, num_heads, operations=operations, **factory)
        self.cross_attn_token_to_image = SAMAttention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate, operations=operations, **factory)
        self.cross_attn_image_to_token = SAMAttention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate, operations=operations, **factory)
        self.mlp = nn.Sequential(
            operations.Linear(embedding_dim, mlp_dim, **factory),
            nn.ReLU(),
            operations.Linear(mlp_dim, embedding_dim, **factory),
        )
        self.norm1 = operations.LayerNorm(embedding_dim, **factory)
        self.norm2 = operations.LayerNorm(embedding_dim, **factory)
        self.norm3 = operations.LayerNorm(embedding_dim, **factory)
        self.norm4 = operations.LayerNorm(embedding_dim, **factory)

    def forward(self, queries, keys, query_pe, key_pe):
        """Return the updated ``(queries, keys)`` pair.

        With ``skip_first_layer_pe`` the self-attention uses the raw queries
        and replaces them (no residual); otherwise the positional term is
        added to Q/K and the result is added back to the queries.
        """
        # 1) Self-attention on the queries.
        if self.skip_first_layer_pe:
            queries = self.norm1(self.self_attn(queries, queries, queries))
        else:
            q_with_pe = queries + query_pe
            queries = self.norm1(queries + self.self_attn(q_with_pe, q_with_pe, queries))

        # `keys` is unchanged until the last step, so its positional sum is shared.
        keys_with_pe = keys + key_pe

        # 2) Queries attend to the image.
        q_with_pe = queries + query_pe
        queries = self.norm2(queries + self.cross_attn_token_to_image(q_with_pe, keys_with_pe, keys))

        # 3) MLP on the queries.
        queries = self.norm3(queries + self.mlp(queries))

        # 4) Image attends to the queries.
        q_with_pe = queries + query_pe
        keys = self.norm4(keys + self.cross_attn_image_to_token(keys_with_pe, q_with_pe, queries))
        return queries, keys
|
||||
|
||||
|
||||
class TwoWayTransformer(nn.Module):
    """Stack of ``TwoWayAttentionBlock`` layers plus a final query-to-key attention."""

    def __init__(self, depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048, attention_downsample_rate=2, device=None, dtype=None, operations=None):
        super().__init__()
        blocks = []
        for index in range(depth):
            blocks.append(
                TwoWayAttentionBlock(
                    embedding_dim, num_heads, mlp_dim, attention_downsample_rate,
                    # Only the first layer skips the positional encoding in self-attention.
                    skip_first_layer_pe=(index == 0),
                    device=device, dtype=dtype, operations=operations,
                )
            )
        self.layers = nn.ModuleList(blocks)
        self.final_attn_token_to_image = SAMAttention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate,
            device=device, dtype=dtype, operations=operations,
        )
        self.norm_final = operations.LayerNorm(embedding_dim, device=device, dtype=dtype)

    def forward(self, image_embedding, image_pe, point_embedding):
        """Return the processed ``(queries, keys)`` pair.

        ``point_embedding`` is both the initial queries and their positional
        term; ``image_embedding`` provides the keys and ``image_pe`` their
        positional term.
        """
        queries = point_embedding
        keys = image_embedding
        for block in self.layers:
            queries, keys = block(queries, keys, point_embedding, image_pe)

        q_pos = queries + point_embedding
        k_pos = keys + image_pe
        attended = self.final_attn_token_to_image(q_pos, k_pos, keys)
        queries = self.norm_final(queries + attended)
        return queries, keys
|
||||
|
||||
|
||||
class PositionEmbeddingRandom(nn.Module):
    """Positional encoding built from sin/cos of a random Gaussian projection."""

    def __init__(self, num_pos_feats=64, scale=None):
        super().__init__()
        # A missing (or zero) scale falls back to 1.0.
        gain = scale or 1.0
        self.register_buffer("positional_encoding_gaussian_matrix", gain * torch.randn(2, num_pos_feats))

    def _encode(self, normalized_coords):
        """Encode coordinates normalized to [0, 1]; math runs in fp32, output keeps the input dtype."""
        out_dtype = normalized_coords.dtype
        gauss = self.positional_encoding_gaussian_matrix.to(device=normalized_coords.device, dtype=torch.float32)
        # Shift [0, 1] to [-1, 1] and scale by 2*pi before projecting.
        centered = 2 * normalized_coords.float() - 1
        phase = (2 * math.pi * centered) @ gauss
        features = torch.cat([phase.sin(), phase.cos()], dim=-1)
        return features.to(out_dtype)

    def forward(self, size, device=None):
        """Encode the cell centres of an ``(h, w)`` grid; returns ``[1, 2 * num_pos_feats, h, w]``."""
        h, w = size
        if device is None:
            device = self.positional_encoding_gaussian_matrix.device
        # Cell centres: (index + 0.5) / extent along each axis.
        xs = (torch.arange(1, w + 1, device=device, dtype=torch.float32) - 0.5) / w
        ys = (torch.arange(1, h + 1, device=device, dtype=torch.float32) - 0.5) / h
        grid = torch.stack([xs.expand(h, w), ys[:, None].expand(h, w)], dim=-1)
        return self._encode(grid).permute(2, 0, 1).unsqueeze(0)

    def forward_with_coords(self, pixel_coords, image_size):
        """Encode pixel coordinates (last axis is x, y) for an image of ``image_size`` = (height, width)."""
        scaled = pixel_coords.clone()  # leave the caller's tensor untouched
        img_h, img_w = image_size[0], image_size[1]
        scaled[:, :, 0] /= img_w
        scaled[:, :, 1] /= img_h
        return self._encode(scaled)
|
||||
|
||||
|
||||
# ViTDet backbone + FPN neck
|
||||
|
||||
def window_partition(x: torch.Tensor, window_size: int):
    """Split a ``[B, H, W, C]`` tensor into non-overlapping square windows.

    The bottom and right edges are padded (``F.pad`` default fill) so both
    spatial sizes become multiples of ``window_size``.

    Returns:
        ``(windows, (Hp, Wp))`` where ``windows`` is
        ``[B * num_windows, window_size, window_size, C]`` and ``Hp, Wp`` are
        the padded height and width.
    """
    batch, height, width, channels = x.shape
    # Amount needed to reach the next multiple of window_size (0 if aligned).
    extra_h = -height % window_size
    extra_w = -width % window_size
    if extra_h or extra_w:
        # F.pad pads from the last dim backwards: C, then W, then H.
        x = F.pad(x, (0, 0, 0, extra_w, 0, extra_h))
    padded_h = height + extra_h
    padded_w = width + extra_w
    rows = padded_h // window_size
    cols = padded_w // window_size
    grid = x.view(batch, rows, window_size, cols, window_size, channels)
    windows = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, channels)
    return windows, (padded_h, padded_w)
|
||||
|
||||
|
||||
def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw, hw):
    """Reassemble windows into a ``[B, H, W, C]`` tensor and crop away padding.

    Args:
        windows: ``[B * num_windows, window_size, window_size, C]``.
        window_size: side length of each window.
        pad_hw: padded ``(Hp, Wp)`` the windows were cut from.
        hw: original ``(H, W)`` to crop back to.
    """
    padded_h, padded_w = pad_hw
    height, width = hw
    windows_per_image = padded_h * padded_w // window_size // window_size
    batch = windows.shape[0] // windows_per_image
    rows = padded_h // window_size
    cols = padded_w // window_size
    grid = windows.view(batch, rows, cols, window_size, window_size, -1)
    merged = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, padded_h, padded_w, -1)
    if padded_h > height or padded_w > width:
        # Drop the rows/columns that were added as padding.
        merged = merged[:, :height, :width, :].contiguous()
    return merged
|
||||
|
||||
|
||||
def rope_2d(end_x: int, end_y: int, dim: int, theta: float = 10000.0, scale_pos: float = 1.0):
    """Build 2D axial RoPE frequencies for an ``end_x`` by ``end_y`` grid via ``EmbedND``.

    Positions are enumerated row-major (x varies fastest) and both coordinates
    are multiplied by ``scale_pos``. Each axis gets ``dim // 2`` of the
    rotary dimensions. Per the original note, the result is
    ``[1, 1, HW, dim//2, 2, 2]``.
    """
    flat = torch.arange(end_x * end_y, dtype=torch.float32)
    col = (flat % end_x) * scale_pos
    row = torch.div(flat, end_x, rounding_mode="floor") * scale_pos
    positions = torch.stack([col, row], dim=-1).unsqueeze(0)
    embedder = EmbedND(dim=dim, theta=theta, axes_dim=[dim // 2, dim // 2])
    return embedder(positions)
|
||||
|
||||
|
||||
class _ViTMLP(nn.Module):
    """Two-layer GELU MLP with a hidden width of ``int(dim * mlp_ratio)``."""

    def __init__(self, dim, mlp_ratio=4.0, device=None, dtype=None, operations=None):
        super().__init__()
        hidden_width = int(dim * mlp_ratio)
        factory = dict(device=device, dtype=dtype)
        self.fc1 = operations.Linear(dim, hidden_width, **factory)
        self.act = nn.GELU()
        self.fc2 = operations.Linear(hidden_width, dim, **factory)

    def forward(self, x):
        """Expand, activate, and project back to ``dim`` features."""
        expanded = self.fc1(x)
        activated = self.act(expanded)
        return self.fc2(activated)
|
||||
|
||||
|
||||
class Attention(nn.Module):
    """Multi-head self-attention with a single fused QKV projection and optional RoPE."""

    def __init__(self, dim, num_heads=8, qkv_bias=True, use_rope=False, device=None, dtype=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.use_rope = use_rope
        # One linear layer emits queries, keys and values side by side.
        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, device=device, dtype=dtype)
        self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)

    def forward(self, x, freqs_cis=None):
        """Attend over a ``[B, N, C]`` token sequence.

        RoPE is applied to queries and keys only when the module was built
        with ``use_rope`` and ``freqs_cis`` is supplied.
        """
        batch, tokens, _ = x.shape
        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, self.head_dim)
        # -> [3, B, heads, N, head_dim], then split along the leading axis.
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(dim=0)
        if self.use_rope and freqs_cis is not None:
            q, k = apply_rope(q, k, freqs_cis)
        attended = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, low_precision_attention=False)
        return self.proj(attended)
|
||||
|
||||
|
||||
class Block(nn.Module):
    """Pre-norm transformer block with windowed (``window_size > 0``) or global attention."""

    def __init__(self, dim, num_heads, mlp_ratio=4.0, qkv_bias=True, window_size=0, use_rope=False, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype)
        self.window_size = window_size
        self.norm1 = operations.LayerNorm(dim, **factory)
        self.attn = Attention(dim, num_heads, qkv_bias, use_rope, operations=operations, **factory)
        self.norm2 = operations.LayerNorm(dim, **factory)
        self.mlp = _ViTMLP(dim, mlp_ratio, operations=operations, **factory)

    def forward(self, x, freqs_cis=None):
        """Apply attention and MLP, each with a residual, to a ``[B, H, W, C]`` map."""
        residual = x
        x = self.norm1(x)
        side = self.window_size
        if side > 0:
            # Attend inside each window independently, then stitch back together.
            height, width = x.shape[1], x.shape[2]
            x, padded_hw = window_partition(x, side)
            x = x.view(x.shape[0], side * side, -1)
            x = self.attn(x, freqs_cis=freqs_cis)
            x = x.view(-1, side, side, x.shape[-1])
            x = window_unpartition(x, side, padded_hw, (height, width))
        else:
            # Global attention over every spatial position.
            batch, height, width, channels = x.shape
            x = x.view(batch, height * width, channels)
            x = self.attn(x, freqs_cis=freqs_cis)
            x = x.view(batch, height, width, channels)
        x = residual + x
        return x + self.mlp(self.norm2(x))
|
||||
|
||||
|
||||
class PatchEmbed(nn.Module):
    """Embed an image as patches with one strided, bias-free convolution."""

    def __init__(self, patch_size=14, in_chans=3, embed_dim=1024, device=None, dtype=None, operations=None):
        super().__init__()
        # kernel == stride, so every patch is projected exactly once.
        self.proj = operations.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
            device=device,
            dtype=dtype,
        )

    def forward(self, x):
        """Return the convolution output (channels-first patch grid)."""
        patches = self.proj(x)
        return patches
|
||||
|
||||
|
||||
class ViTDet(nn.Module):
    """ViT trunk that mixes windowed and global attention blocks.

    Blocks whose index is in ``global_att_blocks`` use global attention
    (``window_size=0``); every other block attends within ``window_size``
    windows. When ``use_rope`` is set, two non-persistent RoPE buffers are
    precomputed: one for the full patch grid and one for a single window.
    """

    def __init__(self, img_size=1008, patch_size=14, embed_dim=1024, depth=32, num_heads=16, mlp_ratio=4.625, qkv_bias=True, window_size=24,
                 global_att_blocks=(7, 15, 23, 31), use_rope=True, pretrain_img_size=336, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.global_att_blocks = set(global_att_blocks)

        self.patch_embed = PatchEmbed(patch_size, 3, embed_dim, device=device, dtype=dtype, operations=operations)

        # Learned absolute position table sized for the pretraining grid.
        num_patches = (pretrain_img_size // patch_size) ** 2 + 1  # +1 for cls token
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim, device=device, dtype=dtype))

        self.ln_pre = operations.LayerNorm(embed_dim, device=device, dtype=dtype)

        grid_size = img_size // patch_size
        pretrain_grid = pretrain_img_size // patch_size

        self.blocks = nn.ModuleList()
        for i in range(depth):
            is_global = i in self.global_att_blocks
            self.blocks.append(Block(
                embed_dim, num_heads, mlp_ratio, qkv_bias,
                window_size=0 if is_global else window_size,
                use_rope=use_rope,
                device=device, dtype=dtype, operations=operations,
            ))

        if use_rope:
            # Global RoPE positions are rescaled by pretrain_grid / grid_size;
            # the per-window table uses unscaled positions.
            rope_scale = pretrain_grid / grid_size
            self.register_buffer("freqs_cis", rope_2d(grid_size, grid_size, embed_dim // num_heads, scale_pos=rope_scale), persistent=False)
            self.register_buffer("freqs_cis_window", rope_2d(window_size, window_size, embed_dim // num_heads), persistent=False)
        else:
            self.freqs_cis = None
            self.freqs_cis_window = None

    def _get_pos_embed(self, num_tokens):
        """Return a position table with ``num_tokens`` entries (first entry is the cls slot).

        When the stored table already has ``num_tokens`` entries it is returned
        as-is. Otherwise the spatial part is tiled (repeated, not
        interpolated) and cropped to a square grid of side
        ``int(sqrt(num_tokens - 1))``.
        """
        pos = self.pos_embed
        if pos.shape[1] == num_tokens:
            return pos
        cls_pos = pos[:, :1]
        spatial_pos = pos[:, 1:]
        old_size = int(math.sqrt(spatial_pos.shape[1]))
        new_size = int(math.sqrt(num_tokens - 1)) if num_tokens > 1 else old_size
        # [1, old*old, C] -> [1, C, old, old] so the grid can be tiled spatially.
        spatial_2d = spatial_pos.reshape(1, old_size, old_size, -1).permute(0, 3, 1, 2)
        # One extra tile guarantees the crop below always has enough rows/columns.
        tiles_h = new_size // old_size + 1
        tiles_w = new_size // old_size + 1
        tiled = spatial_2d.tile([1, 1, tiles_h, tiles_w])[:, :, :new_size, :new_size]
        tiled = tiled.permute(0, 2, 3, 1).reshape(1, new_size * new_size, -1)
        return torch.cat([cls_pos, tiled], dim=1)

    def forward(self, x):
        """Run the trunk on ``x`` and return the last block's output permuted to ``[B, C, Hp, Wp]``."""
        x = self.patch_embed(x)
        B, C, Hp, Wp = x.shape
        # Flatten the patch grid to a token sequence: [B, Hp * Wp, C].
        x = x.permute(0, 2, 3, 1).reshape(B, Hp * Wp, C)

        pos = cast_to_input(self._get_pos_embed(Hp * Wp + 1), x)
        # Index 0 is the cls slot; only the spatial entries are added.
        x = x + pos[:, 1:Hp * Wp + 1]

        x = x.view(B, Hp, Wp, C)
        x = self.ln_pre(x)

        freqs_cis_global = self.freqs_cis
        freqs_cis_win = self.freqs_cis_window
        if freqs_cis_global is not None:
            freqs_cis_global = cast_to_input(freqs_cis_global, x)
        if freqs_cis_win is not None:
            freqs_cis_win = cast_to_input(freqs_cis_win, x)

        for block in self.blocks:
            # Windowed blocks get the per-window table, global blocks the full-grid one.
            fc = freqs_cis_win if block.window_size > 0 else freqs_cis_global
            x = block(x, freqs_cis=fc)

        return x.permute(0, 3, 1, 2)
|
||||
|
||||
|
||||
class FPNScaleConv(nn.Module):
    """Rescale a feature map by a fixed factor, then project it to ``out_dim`` channels.

    Supported ``scale`` values:
        4.0 -- two stride-2 transposed convolutions with a GELU in between
        2.0 -- one stride-2 transposed convolution
        1.0 -- no resampling
        0.5 -- 2x2 max pooling

    The resampled map then goes through a 1x1 and a 3x3 convolution.

    Raises:
        ValueError: if ``scale`` is not one of the supported values.
    """

    def __init__(self, in_dim, out_dim, scale, device=None, dtype=None, operations=None):
        super().__init__()
        if scale == 4.0:
            self.dconv_2x2_0 = operations.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2, device=device, dtype=dtype)
            self.dconv_2x2_1 = operations.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2, device=device, dtype=dtype)
            proj_in = in_dim // 4
        elif scale == 2.0:
            self.dconv_2x2 = operations.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2, device=device, dtype=dtype)
            proj_in = in_dim // 2
        elif scale == 1.0:
            proj_in = in_dim
        elif scale == 0.5:
            self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
            proj_in = in_dim
        else:
            # Without this branch proj_in would be unbound below and the
            # failure would surface as an opaque UnboundLocalError.
            raise ValueError(f"Unsupported FPN scale {scale!r}; expected one of 4.0, 2.0, 1.0 or 0.5")
        self.scale = scale
        self.conv_1x1 = operations.Conv2d(proj_in, out_dim, kernel_size=1, device=device, dtype=dtype)
        self.conv_3x3 = operations.Conv2d(out_dim, out_dim, kernel_size=3, padding=1, device=device, dtype=dtype)

    def forward(self, x):
        """Resample ``x`` according to ``self.scale`` and apply the two projection convolutions."""
        if self.scale == 4.0:
            x = F.gelu(self.dconv_2x2_0(x))
            x = self.dconv_2x2_1(x)
        elif self.scale == 2.0:
            x = self.dconv_2x2(x)
        elif self.scale == 0.5:
            x = self.pool(x)
        x = self.conv_1x1(x)
        x = self.conv_3x3(x)
        return x
|
||||
|
||||
|
||||
class PositionEmbeddingSine(nn.Module):
    """2D sinusoidal position encoding (DETR-style) with result caching.

    Half of ``num_pos_feats`` encodes y and half encodes x. Grid encodings
    produced by ``forward`` are cached per ``(H, W, device)``.

    Raises:
        ValueError: if ``num_pos_feats`` is odd.
    """

    def __init__(self, num_pos_feats=256, temperature=10000.0, normalize=True, scale=None):
        super().__init__()
        # Explicit check instead of assert so it survives ``python -O``.
        if num_pos_feats % 2 != 0:
            raise ValueError(f"num_pos_feats must be even, got {num_pos_feats}")
        self.half_dim = num_pos_feats // 2
        self.temperature = temperature
        self.normalize = normalize
        self.scale = scale if scale is not None else 2 * math.pi
        self._cache = {}

    def _sincos(self, vals):
        """Encode values to interleaved sin/cos features; appends a ``half_dim`` axis."""
        # Consecutive feature pairs share one frequency (index // 2).
        freqs = self.temperature ** (2 * (torch.arange(self.half_dim, dtype=torch.float32, device=vals.device) // 2) / self.half_dim)
        raw = vals[..., None] * self.scale / freqs
        # Even slots take sin, odd slots take cos; stack + flatten interleaves them.
        return torch.stack((raw[..., 0::2].sin(), raw[..., 1::2].cos()), dim=-1).flatten(-2)

    def _encode_xy(self, x, y):
        """Encode normalized x, y coordinates to sinusoidal features. Returns (pos_x, pos_y) each [N, half_dim]."""
        # Same frequency table and sin/cos interleaving as the grid path.
        return self._sincos(x), self._sincos(y)

    def encode_boxes(self, cx, cy, w, h):
        """Encode box center + size to [N, d_model+2] features (y features, x features, h, w)."""
        pos_x, pos_y = self._encode_xy(cx, cy)
        return torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)

    def forward(self, x):
        """Return the ``[B, num_pos_feats, H, W]`` encoding for a ``[B, C, H, W]`` input.

        Only the shape and device of ``x`` are used. The result is an expanded
        view of the cached tensor.
        """
        B, C, H, W = x.shape
        key = (H, W, x.device)
        if key not in self._cache:
            gy = torch.arange(H, dtype=torch.float32, device=x.device)
            gx = torch.arange(W, dtype=torch.float32, device=x.device)
            if self.normalize:
                # Map indices to [0, 1]; the epsilon avoids division by zero when H or W is 1.
                gy, gx = gy / (H - 1 + 1e-6), gx / (W - 1 + 1e-6)
            yy, xx = torch.meshgrid(gy, gx, indexing="ij")
            self._cache[key] = torch.cat((self._sincos(yy), self._sincos(xx)), dim=-1).permute(2, 0, 1).unsqueeze(0)
        return self._cache[key].expand(B, -1, -1, -1)
|
||||
|
||||
|
||||
class SAM3VisionBackbone(nn.Module):
    """ViTDet trunk followed by per-scale FPN branches for the detector and the tracker.

    With ``multiplex`` the tracker has two three-scale branches
    (``propagation_convs`` and ``interactive_convs``); otherwise it has a
    single four-scale branch (``sam2_convs``). ``convs`` is always the
    detector branch.
    """

    def __init__(self, embed_dim=1024, d_model=256, multiplex=False, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.trunk = ViTDet(embed_dim=embed_dim, device=device, dtype=dtype, operations=operations, **kwargs)
        self.position_encoding = PositionEmbeddingSine(num_pos_feats=d_model, normalize=True)
        self.multiplex = multiplex

        def make_branch(scale_list):
            # One FPNScaleConv per output scale, all fed by the same trunk output.
            return nn.ModuleList([
                FPNScaleConv(embed_dim, d_model, s, device=device, dtype=dtype, operations=operations)
                for s in scale_list
            ])

        if multiplex:
            scales = [4.0, 2.0, 1.0]
            self.convs = make_branch(scales)
            self.propagation_convs = make_branch(scales)
            self.interactive_convs = make_branch(scales)
        else:
            scales = [4.0, 2.0, 1.0, 0.5]
            self.convs = make_branch(scales)
            self.sam2_convs = make_branch(scales)

    def _run_branch(self, branch, trunk_out):
        """Apply every conv in ``branch`` and compute a matching position encoding per output."""
        feats = [conv(trunk_out) for conv in branch]
        pos = [cast_to_input(self.position_encoding(f), f) for f in feats]
        return feats, pos

    def forward(self, images, need_tracker=False, tracker_mode=None, cached_trunk=None, tracker_only=False):
        """Return ``(features, positions, tracker_features, tracker_positions)``.

        ``cached_trunk``, when given, replaces running the trunk on ``images``.
        With ``tracker_only`` the detector outputs are ``None``. Tracker
        outputs are ``None`` when no tracker branch is selected.
        """
        trunk_out = self.trunk(images) if cached_trunk is None else cached_trunk

        if tracker_only:
            # Skip detector FPN when only tracker features are needed (video tracking)
            if not self.multiplex:
                branch = self.sam2_convs
            elif tracker_mode == "propagation":
                branch = self.propagation_convs
            else:
                branch = self.interactive_convs
            tracker_features, tracker_positions = self._run_branch(branch, trunk_out)
            return None, None, tracker_features, tracker_positions

        features, positions = self._run_branch(self.convs, trunk_out)

        # Pick the tracker branch, if any, for the combined detector+tracker path.
        branch = None
        if self.multiplex:
            if tracker_mode == "propagation":
                branch = self.propagation_convs
            elif tracker_mode == "interactive":
                branch = self.interactive_convs
        elif need_tracker:
            branch = self.sam2_convs

        if branch is None:
            return features, positions, None, None

        tracker_features, tracker_positions = self._run_branch(branch, trunk_out)
        return features, positions, tracker_features, tracker_positions
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,226 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
|
||||
from comfy.ldm.modules.diffusionmodules.openaimodel import Downsample, TimestepEmbedSequential, ResBlock, SpatialTransformer
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
|
||||
|
||||
class ZeroSFT(nn.Module):
    """Modulate features ``h`` with a control signal ``c`` via a learned scale and shift.

    ``c`` is added to ``h`` through a 1x1 conv, the sum is group-normalized,
    then scaled and shifted by maps predicted from ``c``. The result is
    blended with the unmodified input by ``control_scale``. When
    ``concat_channels`` is non-zero, ``h_ori`` is concatenated in front of
    ``h`` before normalization; otherwise it is concatenated afterwards.
    """

    def __init__(self, label_nc, norm_nc, concat_channels=0, dtype=None, device=None, operations=None):
        super().__init__()
        factory = dict(dtype=dtype, device=device)
        kernel = 3
        padding = kernel // 2
        hidden = 128
        fused_channels = norm_nc + concat_channels

        self.param_free_norm = operations.GroupNorm(32, fused_channels, **factory)
        self.mlp_shared = nn.Sequential(
            operations.Conv2d(label_nc, hidden, kernel_size=kernel, padding=padding, **factory),
            nn.SiLU()
        )
        self.zero_mul = operations.Conv2d(hidden, fused_channels, kernel_size=kernel, padding=padding, **factory)
        self.zero_add = operations.Conv2d(hidden, fused_channels, kernel_size=kernel, padding=padding, **factory)
        self.zero_conv = operations.Conv2d(label_nc, norm_nc, 1, 1, 0, **factory)
        self.pre_concat = bool(concat_channels != 0)

    def forward(self, c, h, h_ori=None, control_scale=1):
        """Return ``lerp(unmodified, modulated, control_scale)``."""
        concat_first = h_ori is not None and self.pre_concat
        # The blend baseline is the input exactly as it would be without control.
        baseline = torch.cat([h_ori, h], dim=1) if concat_first else h

        fused = h + self.zero_conv(c)
        if concat_first:
            fused = torch.cat([h_ori, fused], dim=1)

        shared = self.mlp_shared(c)
        gamma = self.zero_mul(shared)
        beta = self.zero_add(shared)
        normed = self.param_free_norm(fused)
        # normed * (1 + gamma) + beta
        modulated = torch.addcmul(normed + beta, normed, gamma)

        if h_ori is not None and not self.pre_concat:
            modulated = torch.cat([h_ori, modulated], dim=1)
        return torch.lerp(baseline, modulated, control_scale)
|
||||
|
||||
|
||||
class _CrossAttnInner(nn.Module):
    """Cross-attention core whose parameter names follow the original CrossAttention state_dict layout."""

    def __init__(self, query_dim, context_dim, heads, dim_head, dtype=None, device=None, operations=None):
        super().__init__()
        factory = dict(dtype=dtype, device=device)
        width = dim_head * heads
        self.heads = heads
        self.to_q = operations.Linear(query_dim, width, bias=False, **factory)
        self.to_k = operations.Linear(context_dim, width, bias=False, **factory)
        self.to_v = operations.Linear(context_dim, width, bias=False, **factory)
        # Kept as a Sequential so the output projection is stored under "to_out.0".
        self.to_out = nn.Sequential(
            operations.Linear(width, query_dim, **factory),
        )

    def forward(self, x, context):
        """Attend from ``x`` (queries) to ``context`` (keys and values)."""
        queries = self.to_q(x)
        keys = self.to_k(context)
        values = self.to_v(context)
        attended = optimized_attention(queries, keys, values, self.heads)
        return self.to_out(attended)
|
||||
|
||||
|
||||
class ZeroCrossAttn(nn.Module):
    """Residual cross-attention from a feature map ``x`` to a control feature map ``context``."""

    def __init__(self, context_dim, query_dim, dtype=None, device=None, operations=None):
        super().__init__()
        head_width = 64
        head_count = query_dim // 64
        self.attn = _CrossAttnInner(query_dim, context_dim, head_count, head_width, dtype=dtype, device=device, operations=operations)
        self.norm1 = operations.GroupNorm(32, query_dim, dtype=dtype, device=device)
        self.norm2 = operations.GroupNorm(32, context_dim, dtype=dtype, device=device)

    def forward(self, context, x, control_scale=1):
        """Return ``x`` plus the attention output scaled by ``control_scale``."""
        _, _, height, width = x.shape
        residual = x

        # Normalize, then flatten [B, C, H, W] -> [B, H*W, C] token sequences.
        query_tokens = self.norm1(x).flatten(2).transpose(1, 2)
        context_tokens = self.norm2(context).flatten(2).transpose(1, 2)
        attended = self.attn(query_tokens, context_tokens)
        # Restore the [B, C, H, W] layout of the query map.
        attended = attended.transpose(1, 2).unflatten(2, (height, width))

        return residual + attended * control_scale
|
||||
|
||||
|
||||
class GLVControl(nn.Module):
    """SUPIR's Guided Latent Vector control encoder. Truncated UNet (input + middle blocks only).

    ``forward`` returns the output of every input block followed by the
    middle block output. Module creation order here defines the state_dict
    key layout, so it must not be reordered.
    """
    def __init__(
        self,
        in_channels=4,
        model_channels=320,
        num_res_blocks=2,
        attention_resolutions=(4, 2),
        channel_mult=(1, 2, 4),
        num_head_channels=64,
        transformer_depth=(1, 2, 10),
        context_dim=2048,
        adm_in_channels=2816,
        use_linear_in_transformer=True,
        use_checkpoint=False,
        dtype=None,
        device=None,
        operations=None,
        **kwargs,
    ):
        super().__init__()
        self.model_channels = model_channels
        time_embed_dim = model_channels * 4

        # Timestep embedding MLP.
        self.time_embed = nn.Sequential(
            operations.Linear(model_channels, time_embed_dim, dtype=dtype, device=device),
            nn.SiLU(),
            operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
        )

        # Embedding MLP for the vector conditioning ``y``; the nested Sequential
        # yields keys of the form "label_emb.0.<n>".
        self.label_emb = nn.Sequential(
            nn.Sequential(
                operations.Linear(adm_in_channels, time_embed_dim, dtype=dtype, device=device),
                nn.SiLU(),
                operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
            )
        )

        self.input_blocks = nn.ModuleList([
            TimestepEmbedSequential(
                operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
            )
        ])
        ch = model_channels
        ds = 1  # current downsample factor, compared against attention_resolutions
        for level, mult in enumerate(channel_mult):
            for nr in range(num_res_blocks):
                layers = [
                    ResBlock(ch, time_embed_dim, 0, out_channels=mult * model_channels,
                             dtype=dtype, device=device, operations=operations)
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    num_heads = ch // num_head_channels
                    layers.append(
                        SpatialTransformer(ch, num_heads, num_head_channels,
                                           depth=transformer_depth[level], context_dim=context_dim,
                                           use_linear=use_linear_in_transformer,
                                           use_checkpoint=use_checkpoint,
                                           dtype=dtype, device=device, operations=operations)
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
            # Every level except the last ends with a downsample.
            if level != len(channel_mult) - 1:
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        Downsample(ch, True, out_channels=ch, dtype=dtype, device=device, operations=operations)
                    )
                )
                ds *= 2

        num_heads = ch // num_head_channels
        self.middle_block = TimestepEmbedSequential(
            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
            SpatialTransformer(ch, num_heads, num_head_channels,
                               depth=transformer_depth[-1], context_dim=context_dim,
                               use_linear=use_linear_in_transformer,
                               use_checkpoint=use_checkpoint,
                               dtype=dtype, device=device, operations=operations),
            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
        )

        # Separate stem that embeds the control input ``x`` (see forward).
        self.input_hint_block = TimestepEmbedSequential(
            operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
        )

    def forward(self, x, timesteps, xt, context=None, y=None, **kwargs):
        """Encode ``xt`` guided by the hint ``x``; return the list of block outputs.

        ``x`` goes through ``input_hint_block`` and is added to the output of
        the first input block only. ``xt`` is the tensor fed through the input
        and middle blocks. Extra ``kwargs`` are ignored.
        """
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
        emb = self.time_embed(t_emb) + self.label_emb(y)

        guided_hint = self.input_hint_block(x, emb, context)

        hs = []
        h = xt
        for module in self.input_blocks:
            if guided_hint is not None:
                h = module(h, emb, context)
                h += guided_hint
                # Clear so the hint is injected only once, after the first block.
                guided_hint = None
            else:
                h = module(h, emb, context)
            hs.append(h)
        h = self.middle_block(h, emb, context)
        hs.append(h)
        return hs
|
||||
|
||||
|
||||
class SUPIR(nn.Module):
    """
    Container for the SUPIR control encoder and its adapter modules.

    State dict keys follow the original SUPIR checkpoint layout:
      control_model.*   -> GLVControl
      project_modules.* -> nn.ModuleList of ZeroSFT/ZeroCrossAttn
    """
    def __init__(self, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(dtype=dtype, device=device, operations=operations)

        self.control_model = GLVControl(**factory)

        project_channel_scale = 2
        cond_output_channels = [320] * 4 + [640] * 3 + [1280] * 3
        project_channels = [int(base * project_channel_scale) for base in [160] * 4 + [320] * 3 + [640] * 3]
        concat_channels = [320] * 2 + [640] * 3 + [1280] * 4 + [0]
        cross_attn_insert_idx = [6, 3]

        # One ZeroSFT adapter per entry of the channel tables, in order.
        self.project_modules = nn.ModuleList()
        for label_nc, norm_nc, extra_nc in zip(project_channels, cond_output_channels, concat_channels):
            adapter = ZeroSFT(label_nc, norm_nc, concat_channels=extra_nc, **factory)
            self.project_modules.append(adapter)

        # Cross-attention adapters are inserted afterwards, highest index first,
        # so the earlier insertion does not shift the later insertion point.
        for slot in cross_attn_insert_idx:
            adapter = ZeroCrossAttn(cond_output_channels[slot], concat_channels[slot], **factory)
            self.project_modules.insert(slot, adapter)
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
import torch
|
||||
from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample
|
||||
|
||||
|
||||
class SUPIRPatch:
|
||||
"""
|
||||
Holds GLVControl (control encoder) + project_modules (ZeroSFT/ZeroCrossAttn adapters).
|
||||
Runs GLVControl lazily on first patch invocation per step, applies adapters through
|
||||
middle_block_after_patch, output_block_merge_patch, and forward_timestep_embed_patch.
|
||||
"""
|
||||
SIGMA_MAX = 14.6146
|
||||
|
||||
def __init__(self, model_patch, project_modules, hint_latent, strength_start, strength_end):
|
||||
self.model_patch = model_patch # CoreModelPatcher wrapping GLVControl
|
||||
self.project_modules = project_modules # nn.ModuleList of ZeroSFT/ZeroCrossAttn
|
||||
self.hint_latent = hint_latent # encoded LQ image latent
|
||||
self.strength_start = strength_start
|
||||
self.strength_end = strength_end
|
||||
self.cached_features = None
|
||||
self.adapter_idx = 0
|
||||
self.control_idx = 0
|
||||
self.current_control_idx = 0
|
||||
self.active = True
|
||||
|
||||
def _ensure_features(self, kwargs):
|
||||
"""Run GLVControl on first call per step, cache results."""
|
||||
if self.cached_features is not None:
|
||||
return
|
||||
x = kwargs["x"]
|
||||
b = x.shape[0]
|
||||
hint = self.hint_latent.to(device=x.device, dtype=x.dtype)
|
||||
if hint.shape[0] != b:
|
||||
hint = hint.expand(b, -1, -1, -1) if hint.shape[0] == 1 else hint.repeat((b + hint.shape[0] - 1) // hint.shape[0], 1, 1, 1)[:b]
|
||||
self.cached_features = self.model_patch.model.control_model(
|
||||
hint, kwargs["timesteps"], x,
|
||||
kwargs["context"], kwargs["y"]
|
||||
)
|
||||
self.adapter_idx = len(self.project_modules) - 1
|
||||
self.control_idx = len(self.cached_features) - 1
|
||||
|
||||
def _get_control_scale(self, kwargs):
|
||||
if self.strength_start == self.strength_end:
|
||||
return self.strength_end
|
||||
sigma = kwargs["transformer_options"].get("sigmas")
|
||||
if sigma is None:
|
||||
return self.strength_end
|
||||
s = sigma[0].item() if sigma.dim() > 0 else sigma.item()
|
||||
t = min(s / self.SIGMA_MAX, 1.0)
|
||||
return t * (self.strength_start - self.strength_end) + self.strength_end
|
||||
|
||||
def middle_after(self, kwargs):
|
||||
"""middle_block_after_patch: run GLVControl lazily, apply last adapter after middle block."""
|
||||
self.cached_features = None # reset from previous step
|
||||
self.current_scale = self._get_control_scale(kwargs)
|
||||
self.active = self.current_scale > 0
|
||||
if not self.active:
|
||||
return {"h": kwargs["h"]}
|
||||
self._ensure_features(kwargs)
|
||||
h = kwargs["h"]
|
||||
h = self.project_modules[self.adapter_idx](
|
||||
self.cached_features[self.control_idx], h, control_scale=self.current_scale
|
||||
)
|
||||
self.adapter_idx -= 1
|
||||
self.control_idx -= 1
|
||||
return {"h": h}
|
||||
|
||||
def output_block(self, h, hsp, transformer_options):
|
||||
"""output_block_patch: ZeroSFT adapter fusion replaces cat([h, hsp]). Returns (h, None) to skip cat."""
|
||||
if not self.active:
|
||||
return h, hsp
|
||||
self.current_control_idx = self.control_idx
|
||||
h = self.project_modules[self.adapter_idx](
|
||||
self.cached_features[self.control_idx], hsp, h, control_scale=self.current_scale
|
||||
)
|
||||
self.adapter_idx -= 1
|
||||
self.control_idx -= 1
|
||||
return h, None
|
||||
|
||||
def pre_upsample(self, layer, x, emb, context, transformer_options, output_shape, *args, **kw):
|
||||
"""forward_timestep_embed_patch for Upsample: extra cross-attn adapter before upsample."""
|
||||
block_type, _ = transformer_options["block"]
|
||||
if block_type == "output" and self.active and self.cached_features is not None:
|
||||
x = self.project_modules[self.adapter_idx](
|
||||
self.cached_features[self.current_control_idx], x, control_scale=self.current_scale
|
||||
)
|
||||
self.adapter_idx -= 1
|
||||
return layer(x, output_shape=output_shape)
|
||||
|
||||
def to(self, device_or_dtype):
|
||||
if isinstance(device_or_dtype, torch.device):
|
||||
self.cached_features = None
|
||||
if self.hint_latent is not None:
|
||||
self.hint_latent = self.hint_latent.to(device_or_dtype)
|
||||
return self
|
||||
|
||||
def models(self):
    """Return a one-element list holding ``self.model_patch``."""
    patch = self.model_patch
    return [patch]
|
||||
|
||||
def register(self, model_patcher):
    """Attach this object's three hooks to ``model_patcher``.

    Installs ``middle_after`` as the ``middle_block_after_patch``,
    ``output_block`` as the output block patch, and ``pre_upsample`` as a
    ``forward_timestep_embed_patch`` keyed on the ``Upsample`` layer class.
    """
    model_patcher.set_model_patch(self.middle_after, "middle_block_after_patch")
    model_patcher.set_model_output_block_patch(self.output_block)

    upsample_hook = (Upsample, self.pre_upsample)
    model_patcher.set_model_patch(upsample_hook, "forward_timestep_embed_patch")
|
||||
|
|
@ -141,3 +141,17 @@ def interpret_gathered_like(tensors, gathered):
|
|||
return dest_views
|
||||
|
||||
aimdo_enabled = False
|
||||
|
||||
extra_ram_release_callback = None
|
||||
RAM_CACHE_HEADROOM = 0
|
||||
|
||||
def set_ram_cache_release_state(callback, headroom):
    """Store the RAM release callback and the RAM cache headroom module-wide.

    ``headroom`` is converted with ``int`` and clamped so the stored
    ``RAM_CACHE_HEADROOM`` is never negative.
    """
    global extra_ram_release_callback, RAM_CACHE_HEADROOM

    extra_ram_release_callback = callback

    clamped = int(headroom)
    if clamped < 0:
        clamped = 0
    RAM_CACHE_HEADROOM = clamped
|
||||
|
||||
def extra_ram_release(target):
    """Forward ``target`` to the registered RAM release callback.

    Returns the callback's result, or ``0`` when no callback is registered.
    """
    callback = extra_ram_release_callback
    return 0 if callback is None else callback(target)
|
||||
|
|
|
|||
|
|
@ -52,6 +52,9 @@ import comfy.ldm.qwen_image.model
|
|||
import comfy.ldm.kandinsky5.model
|
||||
import comfy.ldm.anima.model
|
||||
import comfy.ldm.ace.ace_step15
|
||||
import comfy.ldm.rt_detr.rtdetr_v4
|
||||
import comfy.ldm.ernie.model
|
||||
import comfy.ldm.sam3.detector
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.patcher_extension
|
||||
|
|
@ -576,8 +579,8 @@ class Stable_Zero123(BaseModel):
|
|||
def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device)
|
||||
self.cc_projection.weight.copy_(cc_projection_weight)
|
||||
self.cc_projection.bias.copy_(cc_projection_bias)
|
||||
self.cc_projection.weight = torch.nn.Parameter(cc_projection_weight.clone())
|
||||
self.cc_projection.bias = torch.nn.Parameter(cc_projection_bias.clone())
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = {}
|
||||
|
|
@ -890,7 +893,7 @@ class Flux(BaseModel):
|
|||
return torch.cat((image, mask), dim=1)
|
||||
|
||||
def encode_adm(self, **kwargs):
    """Return the ``pooled_output`` conditioning, or ``None`` when it was not supplied.

    A missing ``pooled_output`` no longer raises ``KeyError``; the previous
    hard lookup also made the tolerant return below it unreachable.
    """
    return kwargs.get("pooled_output", None)
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
|
|
@ -937,9 +940,10 @@ class LongCatImage(Flux):
|
|||
transformer_options = transformer_options.copy()
|
||||
rope_opts = transformer_options.get("rope_options", {})
|
||||
rope_opts = dict(rope_opts)
|
||||
pe_len = float(c_crossattn.shape[1]) if c_crossattn is not None else 512.0
|
||||
rope_opts.setdefault("shift_t", 1.0)
|
||||
rope_opts.setdefault("shift_y", 512.0)
|
||||
rope_opts.setdefault("shift_x", 512.0)
|
||||
rope_opts.setdefault("shift_y", pe_len)
|
||||
rope_opts.setdefault("shift_x", pe_len)
|
||||
transformer_options["rope_options"] = rope_opts
|
||||
return super()._apply_model(x, t, c_concat, c_crossattn, control, transformer_options, **kwargs)
|
||||
|
||||
|
|
@ -1060,6 +1064,10 @@ class LTXAV(BaseModel):
|
|||
if guide_attention_entries is not None:
|
||||
out['guide_attention_entries'] = comfy.conds.CONDConstant(guide_attention_entries)
|
||||
|
||||
ref_audio = kwargs.get("ref_audio", None)
|
||||
if ref_audio is not None:
|
||||
out['ref_audio'] = comfy.conds.CONDConstant(ref_audio)
|
||||
|
||||
return out
|
||||
|
||||
def process_timestep(self, timestep, x, denoise_mask=None, audio_denoise_mask=None, **kwargs):
|
||||
|
|
@ -1952,3 +1960,22 @@ class Kandinsky5Image(Kandinsky5):
|
|||
|
||||
def concat_cond(self, **kwargs):
|
||||
return None
|
||||
|
||||
class RT_DETR_v4(BaseModel):
    """``BaseModel`` whose network class is ``comfy.ldm.rt_detr.rtdetr_v4.RTv4``."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        network_cls = comfy.ldm.rt_detr.rtdetr_v4.RTv4
        super().__init__(model_config, model_type, device=device, unet_model=network_cls)
|
||||
|
||||
class ErnieImage(BaseModel):
    """``BaseModel`` whose network class is ``comfy.ldm.ernie.model.ErnieImageModel``."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        network_cls = comfy.ldm.ernie.model.ErnieImageModel
        super().__init__(model_config, model_type, device=device, unet_model=network_cls)

    def extra_conds(self, **kwargs):
        """Extend the base conds with ``c_crossattn`` when ``cross_attn`` is given."""
        conds = super().extra_conds(**kwargs)
        text_cond = kwargs.get("cross_attn", None)
        if text_cond is None:
            return conds
        conds['c_crossattn'] = comfy.conds.CONDRegular(text_cond)
        return conds
|
||||
|
||||
class SAM3(BaseModel):
    """``BaseModel`` whose network class is ``comfy.ldm.sam3.detector.SAM3Model``."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        network_cls = comfy.ldm.sam3.detector.SAM3Model
        super().__init__(model_config, model_type, device=device, unet_model=network_cls)
|
||||
|
|
|
|||
|
|
@ -696,8 +696,36 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
|||
if '{}encoder.lyric_encoder.layers.0.input_layernorm.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config = {}
|
||||
dit_config["audio_model"] = "ace1.5"
|
||||
head_dim = 128
|
||||
dit_config["hidden_size"] = state_dict['{}decoder.layers.0.self_attn_norm.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["intermediate_size"] = state_dict['{}decoder.layers.0.mlp.gate_proj.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["num_heads"] = state_dict['{}decoder.layers.0.self_attn.q_proj.weight'.format(key_prefix)].shape[0] // head_dim
|
||||
|
||||
dit_config["encoder_hidden_size"] = state_dict['{}encoder.lyric_encoder.layers.0.input_layernorm.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["encoder_num_heads"] = state_dict['{}encoder.lyric_encoder.layers.0.self_attn.q_proj.weight'.format(key_prefix)].shape[0] // head_dim
|
||||
dit_config["encoder_intermediate_size"] = state_dict['{}encoder.lyric_encoder.layers.0.mlp.gate_proj.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["num_dit_layers"] = count_blocks(state_dict_keys, '{}decoder.layers.'.format(key_prefix) + '{}.')
|
||||
return dit_config
|
||||
|
||||
if '{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix) in state_dict_keys: # RT-DETR_v4
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "RT_DETR_v4"
|
||||
dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
|
||||
return dit_config
|
||||
|
||||
if '{}layers.0.mlp.linear_fc2.weight'.format(key_prefix) in state_dict_keys: # Ernie Image
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "ernie"
|
||||
return dit_config
|
||||
|
||||
if 'detector.backbone.vision_backbone.trunk.blocks.0.attn.qkv.weight' in state_dict_keys: # SAM3 / SAM3.1
|
||||
if 'detector.transformer.decoder.query_embed.weight' in state_dict_keys:
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "SAM3"
|
||||
if 'detector.backbone.vision_backbone.propagation_convs.0.conv_1x1.weight' in state_dict_keys:
|
||||
dit_config["image_model"] = "SAM31"
|
||||
return dit_config
|
||||
|
||||
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
|
||||
return None
|
||||
|
||||
|
|
@ -853,6 +881,10 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
|
|||
return model_config
|
||||
|
||||
def unet_prefix_from_state_dict(state_dict):
|
||||
# SAM3: detector.* and tracker.* at top level, no common prefix
|
||||
if any(k.startswith("detector.") for k in state_dict) and any(k.startswith("tracker.") for k in state_dict):
|
||||
return ""
|
||||
|
||||
candidates = ["model.diffusion_model.", #ldm/sgm models
|
||||
"model.model.", #audio models
|
||||
"net.", #cosmos
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ total_vram = 0
|
|||
|
||||
# Training Related State
|
||||
in_training = False
|
||||
training_fp8_bwd = False
|
||||
|
||||
|
||||
def get_supported_float8_types():
|
||||
|
|
@ -668,7 +669,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins
|
|||
|
||||
for i in range(len(current_loaded_models) -1, -1, -1):
|
||||
shift_model = current_loaded_models[i]
|
||||
if shift_model.device == device:
|
||||
if device is None or shift_model.device == device:
|
||||
if shift_model not in keep_loaded and not shift_model.is_dead():
|
||||
can_unload.append((-shift_model.model_offloaded_memory(), sys.getrefcount(shift_model.model), shift_model.model_memory(), i))
|
||||
shift_model.currently_used = False
|
||||
|
|
@ -678,8 +679,8 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins
|
|||
i = x[-1]
|
||||
memory_to_free = 1e32
|
||||
pins_to_free = 1e32
|
||||
if not DISABLE_SMART_MEMORY:
|
||||
memory_to_free = memory_required - get_free_memory(device)
|
||||
if not DISABLE_SMART_MEMORY or device is None:
|
||||
memory_to_free = 0 if device is None else memory_required - get_free_memory(device)
|
||||
pins_to_free = pins_required - get_free_ram()
|
||||
if current_loaded_models[i].model.is_dynamic() and for_dynamic:
|
||||
#don't actually unload dynamic models for the sake of other dynamic models
|
||||
|
|
@ -707,7 +708,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins
|
|||
|
||||
if len(unloaded_model) > 0:
|
||||
soft_empty_cache()
|
||||
else:
|
||||
elif device is not None:
|
||||
if vram_state != VRAMState.HIGH_VRAM:
|
||||
mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
|
||||
if mem_free_torch > mem_free_total * 0.25:
|
||||
|
|
@ -1325,9 +1326,9 @@ MAX_PINNED_MEMORY = -1
|
|||
if not args.disable_pinned_memory:
|
||||
if is_nvidia() or is_amd():
|
||||
if WINDOWS:
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50%
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50%
|
||||
else:
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
|
||||
logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
|
||||
|
||||
PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
|
||||
|
|
@ -1402,8 +1403,6 @@ def unpin_memory(tensor):
|
|||
|
||||
if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
|
||||
TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
|
||||
if len(PINNED_MEMORY) == 0:
|
||||
TOTAL_PINNED_MEMORY = 0
|
||||
return True
|
||||
else:
|
||||
logging.warning("Unpin error.")
|
||||
|
|
@ -1733,6 +1732,21 @@ def supports_mxfp8_compute(device=None):
|
|||
|
||||
return True
|
||||
|
||||
def supports_fp64(device=None):
    """Report whether float64 is treated as supported.

    Returns ``False`` when ``device`` is an MPS device, or when the Intel XPU,
    DirectML or ixuca backend checks report true; ``True`` otherwise.  The
    checks run in that order and stop at the first one that matches.
    """
    unsupported = (
        is_device_mps(device)
        or is_intel_xpu()
        or is_directml_enabled()
        or is_ixuca()
    )
    return not unsupported
|
||||
|
||||
def extended_fp16_support():
|
||||
# TODO: check why some models work with fp16 on newer torch versions but not on older
|
||||
if torch_version_numeric < (2, 7):
|
||||
|
|
@ -1787,7 +1801,7 @@ def debug_memory_summary():
|
|||
return torch.cuda.memory.memory_summary()
|
||||
return ""
|
||||
|
||||
class InterruptProcessingException(BaseException):
    """Exception type signalling that processing was interrupted.

    Derives from ``BaseException`` rather than ``Exception`` so that generic
    ``except Exception`` handlers do not catch it by accident.
    """
    pass
|
||||
|
||||
interrupt_processing_mutex = threading.RLock()
|
||||
|
|
|
|||
|
|
@ -300,9 +300,6 @@ class ModelPatcher:
|
|||
def model_mmap_residency(self, free=False):
|
||||
return comfy.model_management.module_mmap_residency(self.model, free=free)
|
||||
|
||||
def get_ram_usage(self):
|
||||
return self.model_size()
|
||||
|
||||
def loaded_size(self):
|
||||
return self.model.model_loaded_weight_memory
|
||||
|
||||
|
|
@ -509,6 +506,10 @@ class ModelPatcher:
|
|||
def set_model_noise_refiner_patch(self, patch):
|
||||
self.set_model_patch(patch, "noise_refiner")
|
||||
|
||||
def set_model_middle_block_after_patch(self, patch):
|
||||
self.set_model_patch(patch, "middle_block_after_patch")
|
||||
|
||||
|
||||
def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
|
||||
rope_options = self.model_options["transformer_options"].get("rope_options", {})
|
||||
rope_options["scale_x"] = scale_x
|
||||
|
|
@ -684,9 +685,9 @@ class ModelPatcher:
|
|||
sd.pop(k)
|
||||
return sd
|
||||
|
||||
def patch_weight_to_device(self, key, device_to=None, inplace_update=False, return_weight=False):
|
||||
def patch_weight_to_device(self, key, device_to=None, inplace_update=False, return_weight=False, force_cast=False):
|
||||
weight, set_func, convert_func = get_key_weight(self.model, key)
|
||||
if key not in self.patches:
|
||||
if key not in self.patches and not force_cast:
|
||||
return weight
|
||||
|
||||
inplace_update = self.weight_inplace_update or inplace_update
|
||||
|
|
@ -694,7 +695,7 @@ class ModelPatcher:
|
|||
if key not in self.backup and not return_weight:
|
||||
self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
|
||||
|
||||
temp_dtype = comfy.model_management.lora_compute_dtype(device_to)
|
||||
temp_dtype = comfy.model_management.lora_compute_dtype(device_to) if key in self.patches else None
|
||||
if device_to is not None:
|
||||
temp_weight = comfy.model_management.cast_to_device(weight, device_to, temp_dtype, copy=True)
|
||||
else:
|
||||
|
|
@ -702,9 +703,10 @@ class ModelPatcher:
|
|||
if convert_func is not None:
|
||||
temp_weight = convert_func(temp_weight, inplace=True)
|
||||
|
||||
out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
|
||||
out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key) if key in self.patches else temp_weight
|
||||
if set_func is None:
|
||||
out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
|
||||
if key in self.patches:
|
||||
out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
|
||||
if return_weight:
|
||||
return out_weight
|
||||
elif inplace_update:
|
||||
|
|
@ -1583,7 +1585,7 @@ class ModelPatcherDynamic(ModelPatcher):
|
|||
key = key_param_name_to_key(n, param_key)
|
||||
if key in self.backup:
|
||||
comfy.utils.set_attr_param(self.model, key, self.backup[key].weight)
|
||||
self.patch_weight_to_device(key, device_to=device_to)
|
||||
self.patch_weight_to_device(key, device_to=device_to, force_cast=True)
|
||||
weight, _, _ = get_key_weight(self.model, key)
|
||||
if weight is not None:
|
||||
self.model.model_loaded_weight_memory += weight.numel() * weight.element_size()
|
||||
|
|
@ -1608,6 +1610,10 @@ class ModelPatcherDynamic(ModelPatcher):
|
|||
m._v = vbar.alloc(v_weight_size)
|
||||
allocated_size += v_weight_size
|
||||
|
||||
for param in params:
|
||||
if param not in ("weight", "bias"):
|
||||
force_load_param(self, param, device_to)
|
||||
|
||||
else:
|
||||
for param in params:
|
||||
key = key_param_name_to_key(n, param)
|
||||
|
|
|
|||
71
comfy/ops.py
71
comfy/ops.py
|
|
@ -777,8 +777,16 @@ from .quant_ops import (
|
|||
|
||||
|
||||
class QuantLinearFunc(torch.autograd.Function):
|
||||
"""Custom autograd function for quantized linear: quantized forward, compute_dtype backward.
|
||||
Handles any input rank by flattening to 2D for matmul and restoring shape after.
|
||||
"""Custom autograd function for quantized linear: quantized forward, optionally FP8 backward.
|
||||
|
||||
When training_fp8_bwd is enabled:
|
||||
- Forward: quantize input per layout (FP8/NVFP4), use quantized matmul
|
||||
- Backward: all matmuls use FP8 tensor cores via torch.mm dispatch
|
||||
- Cached input is FP8 (half the memory of bf16)
|
||||
|
||||
When training_fp8_bwd is disabled:
|
||||
- Forward: quantize input per layout, use quantized matmul
|
||||
- Backward: dequantize weight to compute_dtype, use standard matmul
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -786,7 +794,7 @@ class QuantLinearFunc(torch.autograd.Function):
|
|||
input_shape = input_float.shape
|
||||
inp = input_float.detach().flatten(0, -2) # zero-cost view to 2D
|
||||
|
||||
# Quantize input (same as inference path)
|
||||
# Quantize input for forward (same layout as weight)
|
||||
if layout_type is not None:
|
||||
q_input = QuantizedTensor.from_float(inp, layout_type, scale=input_scale)
|
||||
else:
|
||||
|
|
@ -797,43 +805,68 @@ class QuantLinearFunc(torch.autograd.Function):
|
|||
|
||||
output = torch.nn.functional.linear(q_input, w, b)
|
||||
|
||||
# Restore original input shape
|
||||
# Unflatten output to match original input shape
|
||||
if len(input_shape) > 2:
|
||||
output = output.unflatten(0, input_shape[:-1])
|
||||
|
||||
ctx.save_for_backward(input_float, weight)
|
||||
# Save for backward
|
||||
ctx.input_shape = input_shape
|
||||
ctx.has_bias = bias is not None
|
||||
ctx.compute_dtype = compute_dtype
|
||||
ctx.weight_requires_grad = weight.requires_grad
|
||||
ctx.fp8_bwd = comfy.model_management.training_fp8_bwd
|
||||
|
||||
if ctx.fp8_bwd:
|
||||
# Cache FP8 quantized input — half the memory of bf16
|
||||
if isinstance(q_input, QuantizedTensor) and layout_type.startswith('TensorCoreFP8'):
|
||||
ctx.q_input = q_input # already FP8, reuse
|
||||
else:
|
||||
# NVFP4 or other layout — quantize input to FP8 for backward
|
||||
ctx.q_input = QuantizedTensor.from_float(inp, "TensorCoreFP8E4M3Layout")
|
||||
ctx.save_for_backward(weight)
|
||||
else:
|
||||
ctx.q_input = None
|
||||
ctx.save_for_backward(input_float, weight)
|
||||
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
@torch.autograd.function.once_differentiable
|
||||
def backward(ctx, grad_output):
|
||||
input_float, weight = ctx.saved_tensors
|
||||
compute_dtype = ctx.compute_dtype
|
||||
grad_2d = grad_output.flatten(0, -2).to(compute_dtype)
|
||||
|
||||
# Dequantize weight to compute dtype for backward matmul
|
||||
if isinstance(weight, QuantizedTensor):
|
||||
weight_f = weight.dequantize().to(compute_dtype)
|
||||
# Value casting — only difference between fp8 and non-fp8 paths
|
||||
if ctx.fp8_bwd:
|
||||
weight, = ctx.saved_tensors
|
||||
# Wrap as FP8 QuantizedTensors → torch.mm dispatches to _scaled_mm
|
||||
grad_mm = QuantizedTensor.from_float(grad_2d, "TensorCoreFP8E5M2Layout")
|
||||
if isinstance(weight, QuantizedTensor) and weight._layout_cls.startswith("TensorCoreFP8"):
|
||||
weight_mm = weight
|
||||
elif isinstance(weight, QuantizedTensor):
|
||||
weight_mm = QuantizedTensor.from_float(weight.dequantize().to(compute_dtype), "TensorCoreFP8E4M3Layout")
|
||||
else:
|
||||
weight_mm = QuantizedTensor.from_float(weight.to(compute_dtype), "TensorCoreFP8E4M3Layout")
|
||||
input_mm = ctx.q_input
|
||||
else:
|
||||
weight_f = weight.to(compute_dtype)
|
||||
input_float, weight = ctx.saved_tensors
|
||||
# Standard tensors → torch.mm does regular matmul
|
||||
grad_mm = grad_2d
|
||||
if isinstance(weight, QuantizedTensor):
|
||||
weight_mm = weight.dequantize().to(compute_dtype)
|
||||
else:
|
||||
weight_mm = weight.to(compute_dtype)
|
||||
input_mm = input_float.flatten(0, -2).to(compute_dtype) if ctx.weight_requires_grad else None
|
||||
|
||||
# grad_input = grad_output @ weight
|
||||
grad_input = torch.mm(grad_2d, weight_f)
|
||||
# Computation — same for both paths, dispatch handles the rest
|
||||
grad_input = torch.mm(grad_mm, weight_mm)
|
||||
if len(ctx.input_shape) > 2:
|
||||
grad_input = grad_input.unflatten(0, ctx.input_shape[:-1])
|
||||
|
||||
# grad_weight (only if weight requires grad, typically frozen for quantized training)
|
||||
grad_weight = None
|
||||
if ctx.weight_requires_grad:
|
||||
input_f = input_float.flatten(0, -2).to(compute_dtype)
|
||||
grad_weight = torch.mm(grad_2d.t(), input_f)
|
||||
grad_weight = torch.mm(grad_mm.t(), input_mm)
|
||||
|
||||
# grad_bias
|
||||
grad_bias = None
|
||||
if ctx.has_bias:
|
||||
grad_bias = grad_2d.sum(dim=0)
|
||||
|
|
@ -895,6 +928,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
|||
weight = state_dict.pop(weight_key, None)
|
||||
if weight is None:
|
||||
logging.warning(f"Missing weight for layer {layer_name}")
|
||||
self.weight = None
|
||||
return
|
||||
|
||||
manually_loaded_keys = [weight_key]
|
||||
|
|
@ -1001,6 +1035,9 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
|||
if self.bias is not None:
|
||||
sd["{}bias".format(prefix)] = self.bias
|
||||
|
||||
if self.weight is None:
|
||||
return sd
|
||||
|
||||
if isinstance(self.weight, QuantizedTensor):
|
||||
sd_out = self.weight.state_dict("{}weight".format(prefix))
|
||||
for k in sd_out:
|
||||
|
|
@ -1114,7 +1151,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
|||
if param is None:
|
||||
continue
|
||||
p = fn(param)
|
||||
if p.is_inference():
|
||||
if (not torch.is_inference_mode_enabled()) and p.is_inference():
|
||||
p = p.clone()
|
||||
self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False))
|
||||
for key, buf in self._buffers.items():
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import comfy.model_management
|
|||
import comfy.memory_management
|
||||
import comfy_aimdo.host_buffer
|
||||
import comfy_aimdo.torch
|
||||
import psutil
|
||||
|
||||
from comfy.cli_args import args
|
||||
|
||||
|
|
@ -12,6 +13,11 @@ def pin_memory(module):
|
|||
if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None:
|
||||
return
|
||||
#FIXME: This is a RAM cache trigger event
|
||||
ram_headroom = comfy.memory_management.RAM_CACHE_HEADROOM
|
||||
#we split the difference and assume half the RAM cache headroom is for us
|
||||
if ram_headroom > 0 and psutil.virtual_memory().available < (ram_headroom * 0.5):
|
||||
comfy.memory_management.extra_ram_release(ram_headroom)
|
||||
|
||||
size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
|
||||
|
||||
if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY:
|
||||
|
|
|
|||
86
comfy/sd.py
86
comfy/sd.py
|
|
@ -12,6 +12,7 @@ from .ldm.cascade.stage_c_coder import StageC_coder
|
|||
from .ldm.audio.autoencoder import AudioOobleckVAE
|
||||
import comfy.ldm.genmo.vae.model
|
||||
import comfy.ldm.lightricks.vae.causal_video_autoencoder
|
||||
import comfy.ldm.lightricks.vae.audio_vae
|
||||
import comfy.ldm.cosmos.vae
|
||||
import comfy.ldm.wan.vae
|
||||
import comfy.ldm.wan.vae2_2
|
||||
|
|
@ -61,6 +62,8 @@ import comfy.text_encoders.newbie
|
|||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.qwen35
|
||||
import comfy.text_encoders.ernie
|
||||
|
||||
import comfy.model_patcher
|
||||
import comfy.lora
|
||||
|
|
@ -279,9 +282,6 @@ class CLIP:
|
|||
n.apply_hooks_to_conds = self.apply_hooks_to_conds
|
||||
return n
|
||||
|
||||
def get_ram_usage(self):
|
||||
return self.patcher.get_ram_usage()
|
||||
|
||||
def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
|
||||
return self.patcher.add_patches(patches, strength_patch, strength_model)
|
||||
|
||||
|
|
@ -425,13 +425,13 @@ class CLIP:
|
|||
def get_key_patches(self):
|
||||
return self.patcher.get_key_patches()
|
||||
|
||||
def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None, presence_penalty=0.0):
    """Generate from ``tokens`` with the wrapped ``cond_stage_model``.

    Resets the clip options, loads the model for ``tokens``, sets the
    ``layer`` option to ``None`` and the execution device to the patcher's
    load device, then delegates to ``cond_stage_model.generate``.  All
    sampling arguments, including ``presence_penalty`` (default ``0.0``, so
    existing callers are unaffected), are forwarded by keyword.

    Returns whatever ``cond_stage_model.generate`` returns.
    """
    self.cond_stage_model.reset_clip_options()

    self.load_model(tokens)
    self.cond_stage_model.set_clip_options({"layer": None})
    self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
    return self.cond_stage_model.generate(
        tokens,
        do_sample=do_sample,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        min_p=min_p,
        repetition_penalty=repetition_penalty,
        seed=seed,
        presence_penalty=presence_penalty,
    )
|
||||
|
||||
def decode(self, token_ids, skip_special_tokens=True):
|
||||
return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
|
|
@ -558,12 +558,19 @@ class VAE:
|
|||
old_memory_used_decode = self.memory_used_decode
|
||||
self.memory_used_decode = lambda shape, dtype: old_memory_used_decode(shape, dtype) * 4.0
|
||||
|
||||
decoder_ch = sd['decoder.conv_in.weight'].shape[0] // ddconfig['ch_mult'][-1]
|
||||
if decoder_ch != ddconfig['ch']:
|
||||
decoder_ddconfig = ddconfig.copy()
|
||||
decoder_ddconfig['ch'] = decoder_ch
|
||||
else:
|
||||
decoder_ddconfig = None
|
||||
|
||||
if 'post_quant_conv.weight' in sd:
|
||||
self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
|
||||
self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1], **({"decoder_ddconfig": decoder_ddconfig} if decoder_ddconfig is not None else {}))
|
||||
else:
|
||||
self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
|
||||
encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig},
|
||||
decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig})
|
||||
decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': decoder_ddconfig if decoder_ddconfig is not None else ddconfig})
|
||||
elif "decoder.layers.1.layers.0.beta" in sd:
|
||||
config = {}
|
||||
param_key = None
|
||||
|
|
@ -799,6 +806,24 @@ class VAE:
|
|||
self.downscale_index_formula = (4, 8, 8)
|
||||
self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
|
||||
self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
|
||||
elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio
|
||||
sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder."})
|
||||
self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata)
|
||||
self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
|
||||
self.latent_channels = self.first_stage_model.latent_channels
|
||||
self.audio_sample_rate_output = self.first_stage_model.output_sample_rate
|
||||
self.autoencoder = self.first_stage_model.autoencoder # TODO: remove hack for ltxv custom nodes
|
||||
self.output_channels = 2
|
||||
self.pad_channel_value = "replicate"
|
||||
self.upscale_ratio = 4096
|
||||
self.downscale_ratio = 4096
|
||||
self.latent_dim = 2
|
||||
self.process_output = lambda audio: audio
|
||||
self.process_input = lambda audio: audio
|
||||
self.working_dtypes = [torch.float32]
|
||||
self.disable_offload = True
|
||||
self.extra_1d_channel = 16
|
||||
else:
|
||||
logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
|
||||
self.first_stage_model = None
|
||||
|
|
@ -839,9 +864,6 @@ class VAE:
|
|||
self.size = comfy.model_management.module_size(self.first_stage_model)
|
||||
return self.size
|
||||
|
||||
def get_ram_usage(self):
|
||||
return self.model_size()
|
||||
|
||||
def throw_exception_if_invalid(self):
|
||||
if self.first_stage_model is None:
|
||||
raise RuntimeError("ERROR: VAE is invalid: None\n\nIf the VAE is from a checkpoint loader node your checkpoint does not contain a valid VAE.")
|
||||
|
|
@ -1228,6 +1250,12 @@ class TEModel(Enum):
|
|||
QWEN3_8B = 20
|
||||
QWEN3_06B = 21
|
||||
GEMMA_3_4B_VISION = 22
|
||||
QWEN35_08B = 23
|
||||
QWEN35_2B = 24
|
||||
QWEN35_4B = 25
|
||||
QWEN35_9B = 26
|
||||
QWEN35_27B = 27
|
||||
MINISTRAL_3_3B = 28
|
||||
|
||||
|
||||
def detect_te_model(sd):
|
||||
|
|
@ -1267,6 +1295,17 @@ def detect_te_model(sd):
|
|||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
if "model.language_model.layers.0.linear_attn.A_log" in sd and "model.language_model.layers.0.input_layernorm.weight" in sd:
|
||||
weight = sd['model.language_model.layers.0.input_layernorm.weight']
|
||||
if weight.shape[0] == 1024:
|
||||
return TEModel.QWEN35_08B
|
||||
if weight.shape[0] == 2560:
|
||||
return TEModel.QWEN35_4B
|
||||
if weight.shape[0] == 4096:
|
||||
return TEModel.QWEN35_9B
|
||||
if weight.shape[0] == 5120:
|
||||
return TEModel.QWEN35_27B
|
||||
return TEModel.QWEN35_2B
|
||||
if "model.layers.0.post_attention_layernorm.weight" in sd:
|
||||
weight = sd['model.layers.0.post_attention_layernorm.weight']
|
||||
if 'model.layers.0.self_attn.q_norm.weight' in sd:
|
||||
|
|
@ -1283,6 +1322,8 @@ def detect_te_model(sd):
|
|||
return TEModel.MISTRAL3_24B
|
||||
else:
|
||||
return TEModel.MISTRAL3_24B_PRUNED_FLUX2
|
||||
if weight.shape[0] == 3072:
|
||||
return TEModel.MINISTRAL_3_3B
|
||||
|
||||
return TEModel.LLAMA3_8
|
||||
return None
|
||||
|
|
@ -1299,11 +1340,12 @@ def t5xxl_detect(clip_data):
|
|||
return {}
|
||||
|
||||
def llama_detect(clip_data):
    """Run llama detection on the first state dict that contains a known marker weight.

    Each state dict in ``clip_data`` is checked for either marker key (the
    ``self_attn.k_proj`` weight or the ``linear_attn.in_proj_a`` weight of
    layer 0).  The first one that has a marker is handed to
    ``comfy.text_encoders.hunyuan_video.llama_detect`` and its result is
    returned.  An empty dict is returned when nothing matches.
    """
    weight_names = (
        "model.layers.0.self_attn.k_proj.weight",
        "model.layers.0.linear_attn.in_proj_a.weight",
    )

    for sd in clip_data:
        if any(weight_name in sd for weight_name in weight_names):
            return comfy.text_encoders.hunyuan_video.llama_detect(sd)

    return {}
|
||||
|
||||
|
|
@ -1431,9 +1473,18 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
|||
elif te_model == TEModel.JINA_CLIP_2:
|
||||
clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper
|
||||
clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper
|
||||
elif te_model in (TEModel.QWEN35_08B, TEModel.QWEN35_2B, TEModel.QWEN35_4B, TEModel.QWEN35_9B, TEModel.QWEN35_27B):
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
qwen35_type = {TEModel.QWEN35_08B: "qwen35_08b", TEModel.QWEN35_2B: "qwen35_2b", TEModel.QWEN35_4B: "qwen35_4b", TEModel.QWEN35_9B: "qwen35_9b", TEModel.QWEN35_27B: "qwen35_27b"}[te_model]
|
||||
clip_target.clip = comfy.text_encoders.qwen35.te(**llama_detect(clip_data), model_type=qwen35_type)
|
||||
clip_target.tokenizer = comfy.text_encoders.qwen35.tokenizer(model_type=qwen35_type)
|
||||
elif te_model == TEModel.QWEN3_06B:
|
||||
clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
|
||||
elif te_model == TEModel.MINISTRAL_3_3B:
|
||||
clip_target.clip = comfy.text_encoders.ernie.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.ernie.ErnieTokenizer
|
||||
tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
|
||||
else:
|
||||
# clip_l
|
||||
if clip_type == CLIPType.SD3:
|
||||
|
|
@ -1719,15 +1770,18 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
|
|||
"""
|
||||
dtype = model_options.get("dtype", None)
|
||||
|
||||
custom_operations = model_options.get("custom_operations", None)
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
|
||||
#Allow loading unets from checkpoint files
|
||||
diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
|
||||
temp_sd = comfy.utils.state_dict_prefix_replace(sd, {diffusion_model_prefix: ""}, filter_keys=True)
|
||||
if len(temp_sd) > 0:
|
||||
sd = temp_sd
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
|
||||
custom_operations = model_options.get("custom_operations", None)
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
parameters = comfy.utils.calculate_parameters(sd)
|
||||
weight_dtype = comfy.utils.weight_dtype(sd)
|
||||
|
||||
|
|
|
|||
|
|
@ -308,14 +308,14 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
|||
def load_sd(self, sd):
|
||||
return self.transformer.load_state_dict(sd, strict=False, assign=getattr(self, "can_assign_sd", False))
|
||||
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
|
||||
if isinstance(tokens, dict):
|
||||
tokens_only = next(iter(tokens.values())) # todo: get this better?
|
||||
else:
|
||||
tokens_only = tokens
|
||||
tokens_only = [[t[0] for t in b] for b in tokens_only]
|
||||
embeds = self.process_tokens(tokens_only, device=self.execution_device)[0]
|
||||
return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed)
|
||||
return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=presence_penalty)
|
||||
|
||||
def parse_parentheses(string):
|
||||
result = []
|
||||
|
|
@ -740,5 +740,5 @@ class SD1ClipModel(torch.nn.Module):
|
|||
def load_sd(self, sd):
|
||||
return getattr(self, self.clip).load_sd(sd)
|
||||
|
||||
def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None):
|
||||
return getattr(self, self.clip).generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed)
|
||||
def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None, presence_penalty=0.0):
|
||||
return getattr(self, self.clip).generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty)
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ import comfy.text_encoders.z_image
|
|||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.ernie
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
|
|
@ -1734,6 +1735,103 @@ class LongCatImage(supported_models_base.BASE):
|
|||
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.longcat_image.LongCatImageTokenizer, comfy.text_encoders.longcat_image.te(**hunyuan_detect))
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
|
||||
|
||||
class RT_DETR_v4(supported_models_base.BASE):
    """Supported-model entry whose unet_config names the "RT_DETR_v4" image model."""

    unet_config = {"image_model": "RT_DETR_v4"}

    supported_inference_dtypes = [torch.float16, torch.float32]

    def get_model(self, state_dict, prefix="", device=None):
        """Build the model_base.RT_DETR_v4 wrapper; state_dict and prefix are not used here."""
        return model_base.RT_DETR_v4(self, device=device)

    def clip_target(self, state_dict={}):
        """Return None: this entry provides no text-encoder target."""
        return None
|
||||
|
||||
|
||||
class ErnieImage(supported_models_base.BASE):
    """Supported-model entry for checkpoints whose image_model is "ernie"."""

    unet_config = {"image_model": "ernie"}

    sampling_settings = {"multiplier": 1000.0, "shift": 3.0}

    memory_usage_factor = 10.0

    unet_extra_config = {}
    latent_format = latent_formats.Flux2

    supported_inference_dtypes = [torch.bfloat16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
        """Build the model_base.ErnieImage wrapper; state_dict and prefix are not used here."""
        return model_base.ErnieImage(self, device=device)

    def clip_target(self, state_dict={}):
        """Return the ClipTarget pairing the Ernie tokenizer with its text-encoder class.

        llama_detect is run on the keys under
        "<text_encoder_key_prefix[0]>ministral3_3b.transformer." and its result
        is forwarded as keyword arguments to comfy.text_encoders.ernie.te.
        """
        transformer_prefix = f"{self.text_encoder_key_prefix[0]}ministral3_3b.transformer."
        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, transformer_prefix)
        text_encoder_cls = comfy.text_encoders.ernie.te(**detected)
        return supported_models_base.ClipTarget(comfy.text_encoders.ernie.ErnieTokenizer, text_encoder_cls)
|
||||
|
||||
|
||||
class SAM3(supported_models_base.BASE):
    """Supported-model entry for checkpoints whose image_model is "SAM3"."""

    unet_config = {"image_model": "SAM3"}
    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
    text_encoder_key_prefix = ["detector.backbone.language_backbone."]
    unet_extra_prefix = ""

    def process_clip_state_dict(self, state_dict):
        """Convert the text-encoder weights stashed by process_unet_state_dict.

        The state_dict argument is not read; the source is self._clip_stash
        (an empty dict when nothing has been stashed yet). Keys starting with
        "resizer." are dropped from the result.
        """
        stashed = getattr(self, "_clip_stash", {})
        stripped = utils.state_dict_prefix_replace(
            stashed,
            {"detector.backbone.language_backbone.": "", "backbone.language_backbone.": ""},
            filter_keys=True,
        )
        converted = utils.clip_text_transformers_convert(stripped, "encoder.", "sam3_clip.transformer.")
        return {name: tensor for name, tensor in converted.items() if not name.startswith("resizer.")}

    def process_unet_state_dict(self, state_dict):
        """Rewrite checkpoint keys in place and return the same dict.

        Side effect: language-backbone entries (except resizer ones) are moved
        out of state_dict into self._clip_stash.
        """
        # Move the language backbone weights aside for process_clip_state_dict.
        stash = {}
        for key in list(state_dict.keys()):
            if "language_backbone" in key and "resizer" not in key:
                stash[key] = state_dict.pop(key)
        self._clip_stash = stash

        # SAM3.1: remap tracker.model.* -> tracker.*
        nested_prefix = "tracker.model."
        for key in list(state_dict.keys()):
            if key.startswith(nested_prefix):
                state_dict["tracker." + key[len(nested_prefix):]] = state_dict.pop(key)

        # SAM3.1: remove per-block freqs_cis buffers (computed dynamically)
        for key in list(state_dict.keys()):
            if ".attn.freqs_cis" in key:
                state_dict.pop(key)

        # Split fused QKV projections into equal thirds along dim 0.
        for key in list(state_dict.keys()):
            if not key.endswith((".in_proj_weight", ".in_proj_bias")):
                continue
            fused = state_dict.pop(key)
            base, kind = key.rsplit(".in_proj_", 1)
            tail = ".weight" if kind == "weight" else ".bias"
            third = fused.shape[0] // 3
            state_dict[base + ".q_proj" + tail] = fused[:third]
            state_dict[base + ".k_proj" + tail] = fused[third:2 * third]
            state_dict[base + ".v_proj" + tail] = fused[2 * third:]

        # Remap tracker SAM decoder transformer key names to match sam.py TwoWayTransformer
        renames = ((".mlp.lin1.", ".mlp.0."), (".mlp.lin2.", ".mlp.2."), (".norm_final_attn.", ".norm_final."))
        for key in list(state_dict.keys()):
            if "sam_mask_decoder.transformer." not in key:
                continue
            renamed = key
            for old, new in renames:
                renamed = renamed.replace(old, new)
            if renamed != key:
                state_dict[renamed] = state_dict.pop(key)

        return state_dict

    def get_model(self, state_dict, prefix="", device=None):
        """Build the model_base.SAM3 wrapper; state_dict and prefix are not used here."""
        return model_base.SAM3(self, device=device)

    def clip_target(self, state_dict={}):
        """Return the ClipTarget for the SAM3 tokenizer/text-model wrappers."""
        # Imported here rather than at module top, as in the original.
        import comfy.text_encoders.sam3_clip
        sam3_clip = comfy.text_encoders.sam3_clip
        return supported_models_base.ClipTarget(sam3_clip.SAM3TokenizerWrapper, sam3_clip.SAM3ClipModelWrapper)
|
||||
|
||||
|
||||
class SAM31(SAM3):
    """Same handling as SAM3, registered under the "SAM31" image_model name."""

    unet_config = {"image_model": "SAM31"}
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage, SAM3, SAM31]
|
||||
|
||||
models += [SVD_img2vid]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,38 @@
|
|||
from .flux import Mistral3Tokenizer
|
||||
from comfy import sd1_clip
|
||||
import comfy.text_encoders.llama
|
||||
|
||||
class Ministral3_3BTokenizer(Mistral3Tokenizer):
    """Mistral3Tokenizer whose default embedding key is 'ministral3_3b'."""

    def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='ministral3_3b', tokenizer_data={}):
        # NOTE(review): embedding_size defaults to 5120 while Ministral3_3BConfig
        # declares hidden_size 3072 -- confirm which width embeddings should use.
        super().__init__(
            embedding_directory=embedding_directory,
            embedding_size=embedding_size,
            embedding_key=embedding_key,
            tokenizer_data=tokenizer_data,
        )
|
||||
|
||||
class ErnieTokenizer(sd1_clip.SD1Tokenizer):
    """SD1Tokenizer wrapper registered under the name "ministral3_3b"."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        # NOTE(review): the inner tokenizer class is Mistral3Tokenizer, not the
        # Ministral3_3BTokenizer subclass defined in this module, so presumably
        # Mistral3Tokenizer's own embedding defaults apply -- confirm intended.
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            name="ministral3_3b",
            tokenizer=Mistral3Tokenizer,
        )

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
        """Tokenize text with prompt weighting disabled.

        llama_template is accepted but not forwarded to the parent call.
        """
        return super().tokenize_with_weights(
            text,
            return_word_ids=return_word_ids,
            disable_weights=True,
            **kwargs,
        )
|
||||
|
||||
|
||||
class Ministral3_3BModel(sd1_clip.SDClipModel):
    """SDClipModel configured to run comfy.text_encoders.llama.Ministral3_3B."""

    def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
        # attention_mask toggles both applying the masks and returning them.
        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"start": 1, "pad": 0},
            layer_norm_hidden_state=False,
            model_class=comfy.text_encoders.llama.Ministral3_3B,
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
class ErnieTEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper that defaults to the "ministral3_3b" / Ministral3_3BModel pairing."""

    def __init__(self, device="cpu", dtype=None, model_options={}, name="ministral3_3b", clip_model=Ministral3_3BModel):
        super().__init__(
            device=device,
            dtype=dtype,
            name=name,
            clip_model=clip_model,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
    """Return an ErnieTEModel subclass with dtype/quantization settings baked in.

    dtype_llama, when not None, replaces the dtype passed to the constructor.
    llama_quantization_metadata, when not None, is stored under
    "quantization_metadata" in a copy of model_options.
    """

    class ErnieTEModel_(ErnieTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            chosen_dtype = dtype if dtype_llama is None else dtype_llama
            options = model_options
            if llama_quantization_metadata is not None:
                # Copy first so the caller's dict (or the shared default) is untouched.
                options = model_options.copy()
                options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=chosen_dtype, model_options=options)

    return ErnieTEModel_
|
||||
|
|
@ -116,9 +116,9 @@ class MistralTokenizerClass:
|
|||
return LlamaTokenizerFast(**kwargs)
|
||||
|
||||
class Mistral3Tokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_data={}):
|
||||
self.tekken_data = tokenizer_data.get("tekken_model", None)
|
||||
super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
|
||||
super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, disable_weights=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"tekken_model": self.tekken_data}
|
||||
|
|
|
|||
|
|
@ -60,6 +60,30 @@ class Mistral3Small24BConfig:
|
|||
final_norm: bool = True
|
||||
lm_head: bool = False
|
||||
|
||||
@dataclass
class Ministral3_3BConfig:
    """Hyper-parameter defaults used to build the Ministral3_3B text model."""

    # Dataclass fields (order defines the positional __init__ signature).
    vocab_size: int = 131072
    hidden_size: int = 3072
    intermediate_size: int = 9216
    num_hidden_layers: int = 26
    num_attention_heads: int = 32
    num_key_value_heads: int = 8
    max_position_embeddings: int = 262144
    rms_norm_eps: float = 1e-5
    rope_theta: float = 1000000.0
    transformer_type: str = "llama"
    final_norm: bool = True
    lm_head: bool = False

    # Unannotated, so these are plain class attributes rather than dataclass fields.
    head_dim = 128
    rms_norm_add = False
    mlp_activation = "silu"
    qkv_bias = False
    rope_dims = None
    rope_scale = None
    q_norm = None
    k_norm = None
    stop_tokens = [2]
|
||||
|
||||
@dataclass
|
||||
class Qwen25_3BConfig:
|
||||
vocab_size: int = 151936
|
||||
|
|
@ -224,7 +248,7 @@ class Qwen3_8BConfig:
|
|||
k_norm = "gemma3"
|
||||
rope_scale = None
|
||||
final_norm: bool = True
|
||||
lm_head: bool = False
|
||||
lm_head: bool = True
|
||||
stop_tokens = [151643, 151645]
|
||||
|
||||
@dataclass
|
||||
|
|
@ -655,6 +679,17 @@ class Llama2_(nn.Module):
|
|||
if config.lm_head:
|
||||
self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
|
||||
|
||||
def get_past_len(self, past_key_values):
    """Return the length recorded at index 2 of the first cache entry."""
    first_entry = past_key_values[0]
    return first_entry[2]
|
||||
|
||||
def compute_freqs_cis(self, position_ids, device):
    """Call precompute_freqs_cis for position_ids using this model's rope settings."""
    cfg = self.config
    return precompute_freqs_cis(
        cfg.head_dim,
        position_ids,
        cfg.rope_theta,
        cfg.rope_scale,
        cfg.rope_dims,
        device=device,
    )
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None):
|
||||
if embeds is not None:
|
||||
x = embeds
|
||||
|
|
@ -667,17 +702,12 @@ class Llama2_(nn.Module):
|
|||
seq_len = x.shape[1]
|
||||
past_len = 0
|
||||
if past_key_values is not None and len(past_key_values) > 0:
|
||||
past_len = past_key_values[0][2]
|
||||
past_len = self.get_past_len(past_key_values)
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = torch.arange(past_len, past_len + seq_len, device=x.device).unsqueeze(0)
|
||||
|
||||
freqs_cis = precompute_freqs_cis(self.config.head_dim,
|
||||
position_ids,
|
||||
self.config.rope_theta,
|
||||
self.config.rope_scale,
|
||||
self.config.rope_dims,
|
||||
device=x.device)
|
||||
freqs_cis = self.compute_freqs_cis(position_ids, x.device)
|
||||
|
||||
mask = None
|
||||
if attention_mask is not None:
|
||||
|
|
@ -812,9 +842,16 @@ class BaseGenerate:
|
|||
comfy.ops.uncast_bias_weight(module, weight, None, offload_stream)
|
||||
return x
|
||||
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0):
|
||||
device = embeds.device
|
||||
def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):
    """Allocate one cache entry per hidden layer.

    Each entry is a tuple of two uninitialised tensors of shape
    [batch, num_key_value_heads, max_cache_len, head_dim] (presumably the
    key and value buffers) followed by the integer 0.
    """
    cfg = self.model.config
    buffer_shape = [batch, cfg.num_key_value_heads, max_cache_len, cfg.head_dim]

    def blank():
        return torch.empty(buffer_shape, device=device, dtype=execution_dtype)

    return [(blank(), blank(), 0) for _ in range(cfg.num_hidden_layers)]
|
||||
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0):
|
||||
device = embeds.device
|
||||
|
||||
if stop_tokens is None:
|
||||
stop_tokens = self.model.config.stop_tokens
|
||||
|
|
@ -829,11 +866,8 @@ class BaseGenerate:
|
|||
if embeds.ndim == 2:
|
||||
embeds = embeds.unsqueeze(0)
|
||||
|
||||
past_key_values = [] #kv_cache init
|
||||
max_cache_len = embeds.shape[1] + max_length
|
||||
for x in range(model_config.num_hidden_layers):
|
||||
past_key_values.append((torch.empty([embeds.shape[0], model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype),
|
||||
torch.empty([embeds.shape[0], model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
|
||||
past_key_values = self.init_kv_cache(embeds.shape[0], max_cache_len, device, execution_dtype)
|
||||
|
||||
generator = torch.Generator(device=device).manual_seed(seed) if do_sample else None
|
||||
|
||||
|
|
@ -844,7 +878,7 @@ class BaseGenerate:
|
|||
for step in tqdm(range(max_length), desc="Generating tokens"):
|
||||
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values)
|
||||
logits = self.logits(x)[:, -1]
|
||||
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample)
|
||||
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample, presence_penalty=presence_penalty)
|
||||
token_id = next_token[0].item()
|
||||
generated_token_ids.append(token_id)
|
||||
|
||||
|
|
@ -856,7 +890,7 @@ class BaseGenerate:
|
|||
|
||||
return generated_token_ids
|
||||
|
||||
def sample_token(self, logits, temperature, top_k, top_p, min_p, repetition_penalty, token_history, generator, do_sample=True):
|
||||
def sample_token(self, logits, temperature, top_k, top_p, min_p, repetition_penalty, token_history, generator, do_sample=True, presence_penalty=0.0):
|
||||
|
||||
if not do_sample or temperature == 0.0:
|
||||
return torch.argmax(logits, dim=-1, keepdim=True)
|
||||
|
|
@ -867,6 +901,11 @@ class BaseGenerate:
|
|||
for token_id in set(token_history):
|
||||
logits[i, token_id] *= repetition_penalty if logits[i, token_id] < 0 else 1/repetition_penalty
|
||||
|
||||
if presence_penalty is not None and presence_penalty != 0.0:
|
||||
for i in range(logits.shape[0]):
|
||||
for token_id in set(token_history):
|
||||
logits[i, token_id] -= presence_penalty
|
||||
|
||||
if temperature != 1.0:
|
||||
logits = logits / temperature
|
||||
|
||||
|
|
@ -897,6 +936,9 @@ class BaseGenerate:
|
|||
class BaseQwen3:
|
||||
def logits(self, x):
|
||||
input = x[:, -1:]
|
||||
if self.model.config.lm_head:
|
||||
return self.model.lm_head(input)
|
||||
|
||||
module = self.model.embed_tokens
|
||||
|
||||
offload_stream = None
|
||||
|
|
@ -928,6 +970,15 @@ class Mistral3Small24B(BaseLlama, torch.nn.Module):
|
|||
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
class Ministral3_3B(BaseLlama, BaseQwen3, BaseGenerate, torch.nn.Module):
    """Llama2_ backbone configured from Ministral3_3BConfig."""

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        # config_dict entries override the Ministral3_3BConfig defaults.
        cfg = Ministral3_3BConfig(**config_dict)
        self.num_layers = cfg.num_hidden_layers

        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype
|
||||
|
||||
class Qwen25_3B(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
|
|
@ -1028,12 +1079,19 @@ class Qwen25_7BVLI(BaseLlama, BaseGenerate, torch.nn.Module):
|
|||
grid = e.get("extra", None)
|
||||
start = e.get("index")
|
||||
if position_ids is None:
|
||||
position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
|
||||
position_ids = torch.ones((3, embeds.shape[1]), device=embeds.device, dtype=torch.long)
|
||||
position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
|
||||
end = e.get("size") + start
|
||||
len_max = int(grid.max()) // 2
|
||||
start_next = len_max + start
|
||||
position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
|
||||
if attention_mask is not None:
|
||||
# Assign compact sequential positions to attended tokens only,
|
||||
# skipping over padding so post-padding tokens aren't inflated.
|
||||
after_mask = attention_mask[0, end:]
|
||||
text_positions = after_mask.cumsum(0) - 1 + start_next + offset
|
||||
position_ids[:, end:] = torch.where(after_mask.bool(), text_positions, position_ids[0, end:])
|
||||
else:
|
||||
position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
|
||||
position_ids[0, start:end] = start + offset
|
||||
max_d = int(grid[0][1]) // 2
|
||||
position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
|
||||
|
|
|
|||
|
|
@ -64,7 +64,13 @@ class LongCatImageBaseTokenizer(Qwen25_7BVLITokenizer):
|
|||
return [output]
|
||||
|
||||
|
||||
IMAGE_PAD_TOKEN_ID = 151655
|
||||
|
||||
class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
|
||||
T2I_PREFIX = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
|
||||
EDIT_PREFIX = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
|
||||
SUFFIX = "<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(
|
||||
embedding_directory=embedding_directory,
|
||||
|
|
@ -72,10 +78,8 @@ class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
|
|||
name="qwen25_7b",
|
||||
tokenizer=LongCatImageBaseTokenizer,
|
||||
)
|
||||
self.longcat_template_prefix = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
|
||||
self.longcat_template_suffix = "<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, images=None, **kwargs):
|
||||
skip_template = False
|
||||
if text.startswith("<|im_start|>"):
|
||||
skip_template = True
|
||||
|
|
@ -90,11 +94,14 @@ class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
|
|||
text, return_word_ids=return_word_ids, disable_weights=True, **kwargs
|
||||
)
|
||||
else:
|
||||
has_images = images is not None and len(images) > 0
|
||||
template_prefix = self.EDIT_PREFIX if has_images else self.T2I_PREFIX
|
||||
|
||||
prefix_ids = base_tok.tokenizer(
|
||||
self.longcat_template_prefix, add_special_tokens=False
|
||||
template_prefix, add_special_tokens=False
|
||||
)["input_ids"]
|
||||
suffix_ids = base_tok.tokenizer(
|
||||
self.longcat_template_suffix, add_special_tokens=False
|
||||
self.SUFFIX, add_special_tokens=False
|
||||
)["input_ids"]
|
||||
|
||||
prompt_tokens = base_tok.tokenize_with_weights(
|
||||
|
|
@ -106,6 +113,14 @@ class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
|
|||
suffix_pairs = [(t, 1.0) for t in suffix_ids]
|
||||
|
||||
combined = prefix_pairs + prompt_pairs + suffix_pairs
|
||||
|
||||
if has_images:
|
||||
embed_count = 0
|
||||
for i in range(len(combined)):
|
||||
if combined[i][0] == IMAGE_PAD_TOKEN_ID and embed_count < len(images):
|
||||
combined[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"}, combined[i][1])
|
||||
embed_count += 1
|
||||
|
||||
tokens = {"qwen25_7b": [combined]}
|
||||
|
||||
return tokens
|
||||
|
|
|
|||
|
|
@ -91,11 +91,11 @@ class Gemma3_12BModel(sd1_clip.SDClipModel):
|
|||
self.dtypes.add(dtype)
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_12B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
    """Generate token ids from weighted tokens with the Gemma3 transformer.

    presence_penalty defaults to 0.0, matching SDClipModel.generate, so
    callers written before the parameter existed keep working.
    """
    # Drop the weights: each entry of a batch is indexed at [0] for its token.
    tokens_only = [[t[0] for t in b] for b in tokens]
    embeds, _, _, embeds_info = self.process_tokens(tokens_only, self.execution_device)
    comfy.utils.normalize_image_embeddings(embeds, embeds_info, self.transformer.model.config.hidden_size ** 0.5)
    return self.transformer.generate(
        embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed,
        stop_tokens=[106],  # 106 is <end_of_turn>
        presence_penalty=presence_penalty,
    )
|
||||
|
||||
class DualLinearProjection(torch.nn.Module):
|
||||
def __init__(self, in_dim, out_dim_video, out_dim_audio, dtype=None, device=None, operations=None):
|
||||
|
|
@ -189,8 +189,8 @@ class LTXAVTEModel(torch.nn.Module):
|
|||
|
||||
return out.to(device=out_device, dtype=torch.float), pooled, extra
|
||||
|
||||
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
    """Delegate generation to the wrapped gemma3_12b model.

    Only the "gemma3_12b" entry of tokens is used. presence_penalty defaults
    to 0.0 so callers that predate the parameter keep working, and it is
    forwarded by keyword.
    """
    return self.gemma3_12b.generate(
        tokens["gemma3_12b"], do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed,
        presence_penalty=presence_penalty,
    )
|
||||
|
||||
def load_sd(self, sd):
|
||||
if "model.layers.47.self_attn.q_norm.weight" in sd:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,833 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from dataclasses import dataclass, field
|
||||
import os
|
||||
import math
|
||||
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
from comfy import sd1_clip
|
||||
import comfy.text_encoders.qwen_vl
|
||||
|
||||
from .llama import BaseLlama, BaseGenerate, Llama2_, MLP, RMSNorm, apply_rope
|
||||
|
||||
|
||||
def _qwen35_layer_types(n):
|
||||
return [("full_attention" if (i + 1) % 4 == 0 else "linear_attention") for i in range(n)]
|
||||
|
||||
@dataclass
class Qwen35Config:
    """Default hyper-parameters for the Qwen3.5 text model; all are dataclass fields."""

    vocab_size: int = 248320
    hidden_size: int = 2048
    intermediate_size: int = 6144
    num_hidden_layers: int = 24

    # --- full-attention layers ---
    num_attention_heads: int = 8
    num_key_value_heads: int = 2
    head_dim: int = 256
    partial_rotary_factor: float = 0.25

    # --- linear-attention (DeltaNet) layers ---
    linear_num_key_heads: int = 16
    linear_num_value_heads: int = 16
    linear_key_head_dim: int = 128
    linear_value_head_dim: int = 128
    conv_kernel_size: int = 4

    # --- shared by both layer kinds ---
    max_position_embeddings: int = 32768
    rms_norm_eps: float = 1e-6
    rope_theta: float = 10000000.0
    mrope_section: list = field(default_factory=lambda: [11, 11, 10])
    # Default layout covers 24 layers, matching the num_hidden_layers default.
    layer_types: list = field(default_factory=lambda: _qwen35_layer_types(24))
    rms_norm_add: bool = True
    mlp_activation: str = "silu"
    qkv_bias: bool = False
    final_norm: bool = True
    lm_head: bool = False
    stop_tokens: list = field(default_factory=lambda: [248044, 248046])

    # Kept for BaseLlama/BaseGenerate compatibility; not used directly.
    transformer_type: str = "qwen35_2b"
    rope_dims: list = None
    rope_scale: float = None
|
||||
|
||||
QWEN35_VISION_DEFAULTS = dict(hidden_size=1024, num_heads=16, intermediate_size=4096, depth=24, patch_size=16, temporal_patch_size=2, in_channels=3, spatial_merge_size=2, num_position_embeddings=2304)
|
||||
|
||||
QWEN35_MODELS = {
|
||||
"qwen35_08b": dict(hidden_size=1024, intermediate_size=3584, vision=dict(hidden_size=768, num_heads=12, intermediate_size=3072, depth=12)),
|
||||
"qwen35_2b": dict(hidden_size=2048, intermediate_size=6144, num_hidden_layers=24, num_attention_heads=8, num_key_value_heads=2, linear_num_value_heads=16),
|
||||
"qwen35_4b": dict(hidden_size=2560, intermediate_size=9216, num_hidden_layers=32, num_attention_heads=16, num_key_value_heads=4, linear_num_value_heads=32),
|
||||
"qwen35_9b": dict(hidden_size=4096, intermediate_size=12288, num_hidden_layers=32, num_attention_heads=16, num_key_value_heads=4, linear_num_value_heads=32, lm_head=True, vision=dict(hidden_size=1152, intermediate_size=4304, depth=27)),
|
||||
"qwen35_27b": dict(hidden_size=5120, intermediate_size=17408, num_hidden_layers=64, num_attention_heads=24, num_key_value_heads=4, linear_num_value_heads=48, lm_head=True, vision=dict(hidden_size=1152, intermediate_size=4304, depth=27)),
|
||||
}
|
||||
|
||||
|
||||
def _make_config(model_type, config_dict=None):
    """Build a Qwen35Config for model_type.

    Values come from, in increasing priority: the Qwen35Config defaults, the
    QWEN35_MODELS entry for model_type (its "vision" sub-dict is dropped),
    and config_dict. Unknown model_type values fall back to the defaults.

    layer_types is derived from the final num_hidden_layers unless
    config_dict supplies layer_types itself, so overriding the layer count
    through config_dict no longer leaves a stale layer list.
    """
    if config_dict is None:
        config_dict = {}

    # Copy so the shared QWEN35_MODELS table is never mutated.
    overrides = dict(QWEN35_MODELS.get(model_type, {}))
    overrides.pop("vision", None)
    overrides.update(config_dict)

    if "layer_types" not in config_dict and "num_hidden_layers" in overrides:
        overrides["layer_types"] = _qwen35_layer_types(overrides["num_hidden_layers"])

    return Qwen35Config(**overrides)
|
||||
|
||||
|
||||
class RMSNormGated(RMSNorm):
    """RMSNorm whose output is multiplied by SiLU(gate)."""

    def forward(self, x, gate):
        normed = super().forward(x)
        # The gate is cast to x's dtype before the activation.
        return normed * F.silu(gate.to(x.dtype))
|
||||
|
||||
def torch_chunk_gated_delta_rule(query, key, value, g, beta, chunk_size=64, initial_state=None, output_final_state=False):
    """Chunk-wise gated delta rule computed with plain torch ops.

    Args:
        query, key: 4-D tensors with the sequence on dim 1 and heads on dim 2
            (the code swaps dims 1 and 2 and then unpacks ``key`` as
            ``(batch, heads, seq, k_head_dim)``). Both are normalized along
            the last dim before use.
        value: Same layout as ``key``; its last dim is ``v_head_dim``.
        g: Per-token, per-head gate in log space (it is cumulatively summed
            inside each chunk and then exponentiated). 3-D, same dim-1/dim-2
            layout as the other inputs without the feature dim.
        beta: Per-token, per-head factor multiplied into ``key`` and ``value``;
            same layout as ``g``.
        chunk_size: Number of tokens processed per chunk; the sequence is
            zero-padded up to a multiple of it.
        initial_state: Optional starting recurrent state; when ``None`` a zero
            state of shape ``(batch, heads, k_head_dim, v_head_dim)`` is used.
        output_final_state: When False the second return value is ``None``.

    Returns:
        ``(core_attn_out, last_recurrent_state)``. ``core_attn_out`` is cropped
        to the original sequence length, transposed back to the input layout
        and cast to ``query``'s original dtype.
    """
    initial_dtype = query.dtype
    query = F.normalize(query, dim=-1)
    key = F.normalize(key, dim=-1)
    # Move heads in front of the sequence and do all math in float32.
    query, key, value, beta, g = [x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g)]

    batch_size, num_heads, sequence_length, k_head_dim = key.shape
    v_head_dim = value.shape[-1]
    # Right-pad the sequence axis so it divides evenly into chunks.
    pad_size = (chunk_size - sequence_length % chunk_size) % chunk_size
    query = F.pad(query, (0, 0, 0, pad_size))
    key = F.pad(key, (0, 0, 0, pad_size))
    value = F.pad(value, (0, 0, 0, pad_size))
    beta = F.pad(beta, (0, pad_size))
    g = F.pad(g, (0, pad_size))
    total_sequence_length = sequence_length + pad_size
    scale = 1 / (query.shape[-1] ** 0.5)
    query = query * scale

    v_beta = value * beta.unsqueeze(-1)
    k_beta = key * beta.unsqueeze(-1)
    # Split the sequence axis into (num_chunks, chunk_size).
    query, key, value, k_beta, v_beta = [x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1]) for x in (query, key, value, k_beta, v_beta)]
    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
    # Upper triangle INCLUDING the diagonal: those entries are zeroed below.
    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=0)

    # Cumulative log-gate within each chunk; pairwise differences give the
    # decay between two positions of the same chunk (lower triangle only).
    g = g.cumsum(dim=-1)
    decay_mask = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float()).tril()
    attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
    # Row i is rewritten from rows < i, which earlier iterations have already
    # rewritten in place, so this loop must run in increasing order.
    for i in range(1, chunk_size):
        row = attn[..., i, :i].clone()
        sub = attn[..., :i, :i].clone()
        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
    attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
    value = attn @ v_beta
    k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
    last_recurrent_state = (
        torch.zeros(batch_size, num_heads, k_head_dim, v_head_dim).to(value)
        if initial_state is None
        else initial_state.to(value)
    )
    core_attn_out = torch.zeros_like(value)
    # Strict upper triangle: a position may attend to itself but not ahead.
    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=1)

    # Walk the chunks in order, carrying the recurrent state between them.
    for i in range(0, total_sequence_length // chunk_size):
        q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i]
        attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
        # Part of the values already explained by the carried state.
        v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
        v_new = v_i - v_prime
        # Contribution of the carried state, decayed to each position.
        attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
        core_attn_out[:, :, i] = attn_inter + attn @ v_new
        # Decay the state across the whole chunk and add this chunk's update.
        last_recurrent_state = (
            last_recurrent_state * g[:, :, i, -1, None, None].exp()
            + (k_i * (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(-1, -2) @ v_new
        )

    if not output_final_state:
        last_recurrent_state = None
    # Merge chunks back into one sequence axis and drop the padding.
    core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1])
    core_attn_out = core_attn_out[:, :, :sequence_length]
    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
    return core_attn_out, last_recurrent_state
|
||||
|
||||
|
||||
def torch_causal_conv1d_update(x, conv_state, weight, bias=None):
    """Single-step causal depthwise convolution with a rolling state.

    Args:
        x: New input, ``[B, channels, 1]``.
        conv_state: Previous inputs, ``[B, channels, kernel_size-1]``;
            overwritten in place with the most recent window.
        weight: Per-channel taps, ``[channels, kernel_size]``.
        bias: Optional per-channel bias.

    Returns:
        ``silu`` of the convolution output, ``[B, channels, 1]``, in ``x``'s dtype.
    """
    history = conv_state.shape[-1]
    # Full receptive field: cached history followed by the new sample.
    window = torch.cat((conv_state, x), dim=-1).to(weight.dtype)
    # Slide the cache forward by keeping only the newest `history` samples.
    conv_state.copy_(window[..., -history:])
    result = torch.sum(window * weight, dim=-1, keepdim=True)
    if bias is not None:
        result = result + bias[None, :, None]
    return F.silu(result).to(x.dtype)
|
||||
|
||||
|
||||
# GatedDeltaNet - Linear Attention Layer
|
||||
|
||||
class GatedDeltaNet(nn.Module):
    """Linear-attention layer driven by a gated delta rule.

    The layer keeps two kinds of state between calls, passed in and out as a
    ``(recurrent_state, conv_state, step_index)`` tuple: the per-head
    key/value memory and the trailing inputs of the causal conv.
    """

    def __init__(self, config, device=None, dtype=None, ops=None):
        super().__init__()

        factory = {"device": device, "dtype": dtype}
        hidden_size = config.hidden_size

        self.num_key_heads = config.linear_num_key_heads
        self.num_value_heads = config.linear_num_value_heads
        self.key_head_dim = config.linear_key_head_dim
        self.value_head_dim = config.linear_value_head_dim
        self.conv_kernel_size = config.conv_kernel_size

        self.key_dim = self.num_key_heads * self.key_head_dim
        self.value_dim = self.num_value_heads * self.value_head_dim
        # q and k each occupy key_dim channels, v occupies value_dim.
        conv_channels = 2 * self.key_dim + self.value_dim

        self.in_proj_qkv = ops.Linear(hidden_size, conv_channels, bias=False, **factory)
        self.in_proj_z = ops.Linear(hidden_size, self.value_dim, bias=False, **factory)
        self.in_proj_b = ops.Linear(hidden_size, self.num_value_heads, bias=False, **factory)
        self.in_proj_a = ops.Linear(hidden_size, self.num_value_heads, bias=False, **factory)
        self.out_proj = ops.Linear(self.value_dim, hidden_size, bias=False, **factory)

        self.dt_bias = nn.Parameter(torch.empty(self.num_value_heads, **factory))
        self.A_log = nn.Parameter(torch.empty(self.num_value_heads, **factory))

        # Depthwise conv (groups == channels) with left context of kernel_size - 1.
        self.conv1d = ops.Conv1d(
            in_channels=conv_channels,
            out_channels=conv_channels,
            bias=False,
            kernel_size=self.conv_kernel_size,
            groups=conv_channels,
            padding=self.conv_kernel_size - 1,
            **factory,
        )

        self.norm = RMSNormGated(self.value_head_dim, eps=config.rms_norm_eps, **factory)

    def _expand_key_heads(self, query, key, head_axis):
        """Repeat q/k heads along ``head_axis`` to match the value head count."""
        if self.num_value_heads == self.num_key_heads:
            return query, key
        repeats = self.num_value_heads // self.num_key_heads
        return query.repeat_interleave(repeats, dim=head_axis), key.repeat_interleave(repeats, dim=head_axis)

    def forward(self, x, past_key_value=None, **kwargs):
        """Run the layer on ``x`` (``[B, seq_len, hidden]``).

        A cache with a positive step index combined with a single input token
        selects the one-step recurrent path; every other call uses the chunked
        path, which starts from an empty state and, when a cache is supplied,
        writes the resulting state into it. Extra keyword arguments are ignored.

        Returns:
            ``(output, present_key_value)``; the second item is ``None`` when
            no cache was supplied.
        """
        batch_size, seq_len, _ = x.shape
        has_cache = past_key_value is not None
        use_recurrent = has_cache and past_key_value[2] > 0 and seq_len == 1

        # Input projections shared by both paths.
        mixed_qkv = self.in_proj_qkv(x).transpose(1, 2)  # [B, conv_dim, seq_len]
        z = self.in_proj_z(x)
        b = self.in_proj_b(x)
        a = self.in_proj_a(x)

        # Causal convolution.
        if use_recurrent:
            state, conv_cache, steps = past_key_value
            conv = self.conv1d
            taps = comfy.model_management.cast_to_device(conv.weight, mixed_qkv.device, mixed_qkv.dtype).squeeze(1)
            conv_bias = None
            if conv.bias is not None:
                conv_bias = comfy.model_management.cast_to_device(conv.bias, mixed_qkv.device, mixed_qkv.dtype)
            mixed_qkv = torch_causal_conv1d_update(mixed_qkv, conv_cache, taps, conv_bias)
        else:
            if has_cache:
                state, conv_cache, steps = past_key_value
                # Left-pad (or crop, when the pad is negative) to kernel_size
                # columns, then keep the trailing columns as the conv cache.
                left_pad = self.conv_kernel_size - mixed_qkv.shape[-1]
                seeded = F.pad(mixed_qkv, (left_pad, 0))
                conv_cache.copy_(seeded[:, :, -conv_cache.shape[-1]:])
            mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])

        # Back to [B, seq_len, conv_dim], then carve out q / k / v.
        mixed_qkv = mixed_qkv.transpose(1, 2)
        query, key, value = mixed_qkv.split([self.key_dim, self.key_dim, self.value_dim], dim=-1)
        beta = b.sigmoid()
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias.float())

        if use_recurrent:
            # One token: drop the sequence axis and work on [B, heads, dim].
            query = query.reshape(batch_size, self.num_key_heads, self.key_head_dim)
            key = key.reshape(batch_size, self.num_key_heads, self.key_head_dim)
            value = value.reshape(batch_size, self.num_value_heads, self.value_head_dim)
            query, key = self._expand_key_heads(query, key, 1)

            scale = self.key_head_dim ** -0.5
            q = F.normalize(query.float(), dim=-1) * scale
            k = F.normalize(key.float(), dim=-1)
            v = value.float()
            beta_t = beta.reshape(batch_size, -1)
            decay = g.reshape(batch_size, -1).exp()

            # In-place update of the [B, heads, k_dim, v_dim] memory:
            # decay, read the current prediction, write the beta-scaled error.
            state.mul_(decay[:, :, None, None])
            predicted = torch.einsum('bhk,bhkv->bhv', k, state)
            delta = (v - predicted) * beta_t[:, :, None]
            state.add_(k.unsqueeze(-1) * delta.unsqueeze(-2))
            core_attn_out = torch.einsum('bhk,bhkv->bhv', q, state)

            core_attn_out = core_attn_out.to(x.dtype).unsqueeze(1)
            present_key_value = (state, conv_cache, steps + 1)
        else:
            query = query.reshape(batch_size, seq_len, -1, self.key_head_dim)
            key = key.reshape(batch_size, seq_len, -1, self.key_head_dim)
            value = value.reshape(batch_size, seq_len, -1, self.value_head_dim)
            query, key = self._expand_key_heads(query, key, 2)

            core_attn_out, final_state = torch_chunk_gated_delta_rule(
                query, key, value, g=g, beta=beta,
                initial_state=None,
                output_final_state=has_cache,
            )

            present_key_value = None
            if has_cache:
                if final_state is not None:
                    state.copy_(final_state.to(state.dtype))
                present_key_value = (state, conv_cache, steps + seq_len)

        # Gated norm per value head, then the output projection.
        flat_out = core_attn_out.reshape(-1, self.value_head_dim)
        flat_gate = z.reshape(-1, self.value_head_dim)
        core_attn_out = self.norm(flat_out, flat_gate)
        output = self.out_proj(core_attn_out.reshape(batch_size, seq_len, -1))
        return output, present_key_value
|
||||
|
||||
|
||||
# GatedAttention - Full Attention with output gating
|
||||
def precompute_partial_rope(head_dim, rotary_dim, position_ids, theta, device=None, mrope_section=None):
    """Build the RoPE cos/sin tables for the rotary part of each head.

    Args:
        head_dim: Accepted for signature compatibility; not read here.
        rotary_dim: Number of rotated channels; ``rotary_dim // 2`` frequencies
            are generated.
        position_ids: ``[rows, seq]`` positions.
        theta: RoPE base.
        device: Device for the frequency vector.
        mrope_section: Optional per-section sizes. Only used when
            ``position_ids`` has exactly three rows: the tables are split into
            chunks twice these sizes and chunk ``i`` is taken from row ``i % 3``.

    Returns:
        ``(cos, sin_first_half, -sin_second_half)``, each with a singleton
        axis inserted at dim 1.
    """
    exponents = torch.arange(0, rotary_dim, 2, device=device).float() / rotary_dim
    inv_freq = 1.0 / (theta ** exponents)

    rows = position_ids.shape[0]
    freq_column = inv_freq[None, :, None].float().expand(rows, -1, 1)
    pos_row = position_ids[:, None, :].float()
    # Outer product per row: [rows, seq, rotary_dim // 2] after the transpose.
    angles = torch.matmul(freq_column.float(), pos_row.float()).transpose(1, 2)
    emb = torch.cat((angles, angles), dim=-1)
    cos, sin = emb.cos(), emb.sin()

    if mrope_section is not None and rows == 3:
        widths = [section * 2 for section in mrope_section]

        def _interleave_rows(table):
            chunks = table.split(widths, dim=-1)
            picked = [chunk[i % 3] for i, chunk in enumerate(chunks)]
            return torch.cat(picked, dim=-1).unsqueeze(0)

        cos = _interleave_rows(cos)
        sin = _interleave_rows(sin)

    cos = cos.unsqueeze(1)
    sin = sin.unsqueeze(1)
    half = sin.shape[-1] // 2
    return (cos, sin[..., :half], -sin[..., half:])
|
||||
|
||||
|
||||
def apply_partial_rope(xq, xk, freqs_cis, rotary_dim):
    """Rotate the leading ``rotary_dim`` channels of q/k; pass the rest through.

    Returns the re-assembled ``(xq, xk)`` with the same last-dim size as the
    inputs.
    """
    q_head, q_tail = xq[..., :rotary_dim], xq[..., rotary_dim:]
    k_head, k_tail = xk[..., :rotary_dim], xk[..., rotary_dim:]

    q_head, k_head = apply_rope(q_head, k_head, freqs_cis)

    return torch.cat((q_head, q_tail), dim=-1), torch.cat((k_head, k_tail), dim=-1)
|
||||
|
||||
|
||||
class GatedAttention(nn.Module):
    """Full softmax attention whose output is scaled by a sigmoid gate.

    ``q_proj`` emits twice the query width; per head, the first ``head_dim``
    channels are the query and the remaining ``head_dim`` are the gate.
    """

    def __init__(self, config, device=None, dtype=None, ops=None):
        super().__init__()

        factory = {"device": device, "dtype": dtype}
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        self.head_dim = config.head_dim
        self.hidden_size = config.hidden_size
        self.inner_size = self.num_heads * self.head_dim
        self.rotary_dim = int(self.head_dim * config.partial_rotary_factor)

        kv_width = self.num_kv_heads * self.head_dim
        # Doubled width: query + gate.
        self.q_proj = ops.Linear(config.hidden_size, self.inner_size * 2, bias=config.qkv_bias, **factory)
        self.k_proj = ops.Linear(config.hidden_size, kv_width, bias=config.qkv_bias, **factory)
        self.v_proj = ops.Linear(config.hidden_size, kv_width, bias=config.qkv_bias, **factory)
        self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, **factory)

        # Per-head norms on q and k; `add` comes from config.rms_norm_add.
        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, **factory)
        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, **factory)

    def forward(self, x, attention_mask=None, freqs_cis=None, optimized_attention=None, past_key_value=None):
        """Attend over ``x`` (``[B, seq, hidden]``), optionally using a KV cache.

        ``past_key_value`` is ``(keys, values, index)`` where ``index`` is the
        number of positions already filled. New keys/values are written into
        the preallocated buffers when they fit; otherwise they are concatenated
        onto the filled prefix and the concatenated tensors are returned.

        Returns:
            ``(output, present_key_value)``; the second item is ``None`` when
            no cache was supplied.
        """
        batch_size, seq_length, _ = x.shape
        per_head = (batch_size, seq_length, self.num_kv_heads, self.head_dim)

        # Query and gate share one projection; split them per head.
        q_and_gate = self.q_proj(x).view(batch_size, seq_length, self.num_heads, self.head_dim * 2)
        xq = q_and_gate[..., :self.head_dim]
        gate = q_and_gate[..., self.head_dim:].reshape(batch_size, seq_length, -1)  # [B, seq, inner_size]

        xk = self.k_proj(x)
        xv = self.v_proj(x)

        # Normalize q/k per head and move heads in front: [B, heads, seq, head_dim].
        xq = self.q_norm(xq).transpose(1, 2)
        xk = self.k_norm(xk.view(*per_head)).transpose(1, 2)
        xv = xv.view(*per_head).transpose(1, 2)

        xq, xk = apply_partial_rope(xq, xk, freqs_cis, self.rotary_dim)

        present_key_value = None
        if past_key_value is not None:
            past_key, past_value, index = past_key_value
            num_tokens = xk.shape[2]
            filled = index + num_tokens
            if past_key.shape[2] >= filled:
                # Enough room: write in place and attend over the filled prefix.
                past_key[:, :, index:filled] = xk
                past_value[:, :, index:filled] = xv
                xk = past_key[:, :, :filled]
                xv = past_value[:, :, :filled]
                present_key_value = (past_key, past_value, filled)
            else:
                # Buffer too small: fall back to growing tensors by concatenation.
                if index > 0:
                    xk = torch.cat((past_key[:, :, :index], xk), dim=2)
                    xv = torch.cat((past_value[:, :, :index], xv), dim=2)
                present_key_value = (xk, xv, filled)

        # Grouped-query attention: repeat each kv head for its query group.
        if self.num_heads != self.num_kv_heads:
            group = self.num_heads // self.num_kv_heads
            xk = xk.repeat_interleave(group, dim=1)
            xv = xv.repeat_interleave(group, dim=1)

        attended = optimized_attention(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True)
        gated = attended * gate.sigmoid()

        return self.o_proj(gated), present_key_value
|
||||
|
||||
|
||||
# Hybrid Transformer Block
|
||||
class Qwen35TransformerBlock(nn.Module):
    """Pre-norm residual block whose mixer is chosen per layer.

    ``config.layer_types[index]`` selects either the linear-attention mixer
    (``"linear_attention"``) or, for any other value, gated full attention.
    """

    def __init__(self, config, index, device=None, dtype=None, ops=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.layer_type = config.layer_types[index]
        if self.layer_type == "linear_attention":
            self.linear_attn = GatedDeltaNet(config, ops=ops, **factory)
        else:
            self.self_attn = GatedAttention(config, ops=ops, **factory)
        self.mlp = MLP(config, ops=ops, **factory)
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, **factory)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, **factory)

    def forward(self, x, attention_mask=None, freqs_cis=None, optimized_attention=None, past_key_value=None):
        """Apply mixer and MLP with residual connections.

        Returns ``(hidden_states, present_key_value)``.
        """
        normed = self.input_layernorm(x)
        if self.layer_type == "linear_attention":
            # The linear mixer takes no rotary table or attention kernel.
            mixed, present_key_value = self.linear_attn(normed, attention_mask=attention_mask, past_key_value=past_key_value)
        else:
            mixed, present_key_value = self.self_attn(
                normed,
                attention_mask=attention_mask,
                freqs_cis=freqs_cis,
                optimized_attention=optimized_attention,
                past_key_value=past_key_value,
            )

        x = x + mixed
        x = x + self.mlp(self.post_attention_layernorm(x))
        return x, present_key_value
|
||||
|
||||
|
||||
# Qwen35 Transformer Backbone
|
||||
class Qwen35Transformer(Llama2_):
    """Transformer backbone assembled from hybrid ``Qwen35TransformerBlock`` layers.

    Only ``nn.Module.__init__`` is run; the parent class's constructor is
    deliberately bypassed and every submodule is created here.
    """

    def __init__(self, config, device=None, dtype=None, ops=None):
        nn.Module.__init__(self)
        factory = {"device": device, "dtype": dtype}

        self.config = config
        self.vocab_size = config.vocab_size
        self.normalize_in = False

        self.embed_tokens = ops.Embedding(config.vocab_size, config.hidden_size, **factory)
        blocks = [
            Qwen35TransformerBlock(config, index=layer_idx, ops=ops, **factory)
            for layer_idx in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        self.norm = None
        if config.final_norm:
            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, **factory)

        if config.lm_head:
            self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, **factory)

    def get_past_len(self, past_key_values):
        """Return the cached length recorded by the first full-attention layer.

        Yields 0 when there is no full-attention layer or when the cache list
        does not reach that layer.
        """
        first_full = next(
            (idx for idx, layer in enumerate(self.layers) if layer.layer_type == "full_attention"),
            None,
        )
        if first_full is not None and len(past_key_values) > first_full:
            return past_key_values[first_full][2]
        return 0

    def compute_freqs_cis(self, position_ids, device):
        """Build the partial-rotary cos/sin tables for ``position_ids``."""
        cfg = self.config
        rotary_dim = int(cfg.head_dim * cfg.partial_rotary_factor)
        return precompute_partial_rope(
            cfg.head_dim,
            rotary_dim,
            position_ids,
            cfg.rope_theta,
            device=device,
            mrope_section=cfg.mrope_section,
        )
|
||||
|
||||
|
||||
# Vision Encoder
|
||||
class Qwen35VisionPatchEmbed(nn.Module):
    """Embed flattened spatio-temporal patches with a stride-equals-kernel Conv3d."""

    def __init__(self, config, device=None, dtype=None, ops=None):
        super().__init__()
        self.patch_size = config["patch_size"]
        self.temporal_patch_size = config["temporal_patch_size"]
        self.in_channels = config["in_channels"]
        self.embed_dim = config["hidden_size"]
        # Kernel and stride coincide, so each patch yields exactly one output.
        patch_shape = [self.temporal_patch_size, self.patch_size, self.patch_size]
        self.proj = ops.Conv3d(
            self.in_channels,
            self.embed_dim,
            kernel_size=patch_shape,
            stride=patch_shape,
            bias=True,
            device=device,
            dtype=dtype,
        )

    def forward(self, x):
        """Return one ``embed_dim`` vector per patch contained in ``x``."""
        patches = x.view(-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size)
        # Match the projection weight's dtype before the conv.
        patches = patches.to(self.proj.weight.dtype)
        return self.proj(patches).view(-1, self.embed_dim)
|
||||
|
||||
|
||||
class Qwen35VisionMLP(nn.Module):
    """Two-layer feed-forward block with a tanh-approximated GELU in between."""

    def __init__(self, hidden_size, intermediate_size, device=None, dtype=None, ops=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.linear_fc1 = ops.Linear(hidden_size, intermediate_size, bias=True, **factory)
        self.linear_fc2 = ops.Linear(intermediate_size, hidden_size, bias=True, **factory)

    def forward(self, hidden_state):
        """Expand, activate and project back to ``hidden_size``."""
        expanded = self.linear_fc1(hidden_state)
        activated = F.gelu(expanded, approximate="tanh")
        return self.linear_fc2(activated)
|
||||
|
||||
|
||||
class Qwen35VisionRotaryEmbedding(nn.Module):
    """Table of rotary angles (position times inverse frequency)."""

    def __init__(self, dim, theta=10000.0):
        super().__init__()
        self.dim = dim
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        # Non-persistent: recomputed on construction, never stored in checkpoints.
        self.register_buffer("inv_freq", 1.0 / (theta ** exponents), persistent=False)

    def forward(self, seqlen):
        """Return the ``[seqlen, len(inv_freq)]`` angle table for positions ``0..seqlen-1``."""
        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        return torch.outer(positions, self.inv_freq)
|
||||
|
||||
|
||||
class Qwen35VisionAttention(nn.Module):
    """Self-attention over packed patch sequences, one attention call per sequence."""

    def __init__(self, hidden_size, num_heads, device=None, dtype=None, ops=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.dim = hidden_size
        self.num_heads = num_heads
        self.head_dim = self.dim // self.num_heads
        self.qkv = ops.Linear(self.dim, self.dim * 3, bias=True, **factory)
        self.proj = ops.Linear(self.dim, self.dim, **factory)

    def forward(self, x, cu_seqlens, position_embeddings, optimized_attention=None):
        """Attend within each sequence delimited by ``cu_seqlens``.

        ``x`` is ``[total_tokens, dim]``; ``cu_seqlens`` holds cumulative
        boundaries, so consecutive differences are the sequence lengths.
        """
        seq_length = x.shape[0]
        # [tokens, 3, heads, head_dim] -> three [tokens, heads, head_dim] tensors.
        packed = self.qkv(x).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3)
        query_states, key_states, value_states = packed.unbind(0)
        query_states, key_states = apply_rope(query_states, key_states, position_embeddings)

        lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
        per_sequence = zip(
            torch.split(query_states, lengths, dim=0),
            torch.split(key_states, lengths, dim=0),
            torch.split(value_states, lengths, dim=0),
        )

        attn_outputs = []
        for q, k, v in per_sequence:
            # [tokens, heads, dim] -> [1, heads, tokens, dim] for the kernel.
            q, k, v = (t.transpose(0, 1).unsqueeze(0) for t in (q, k, v))
            attn_outputs.append(optimized_attention(q, k, v, self.num_heads, skip_reshape=True))

        merged = torch.cat(attn_outputs, dim=1).reshape(seq_length, -1)
        return self.proj(merged)
|
||||
|
||||
|
||||
class Qwen35VisionBlock(nn.Module):
    """Pre-LayerNorm vision block: attention then MLP, each with a residual."""

    def __init__(self, hidden_size, num_heads, intermediate_size, device=None, dtype=None, ops=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.norm1 = ops.LayerNorm(hidden_size, eps=1e-6, **factory)
        self.norm2 = ops.LayerNorm(hidden_size, eps=1e-6, **factory)
        self.attn = Qwen35VisionAttention(hidden_size, num_heads, ops=ops, **factory)
        self.mlp = Qwen35VisionMLP(hidden_size, intermediate_size, ops=ops, **factory)

    def forward(self, x, cu_seqlens, position_embeddings, optimized_attention=None):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = self.attn(
            self.norm1(x),
            cu_seqlens=cu_seqlens,
            position_embeddings=position_embeddings,
            optimized_attention=optimized_attention,
        )
        x = x + attended
        return x + self.mlp(self.norm2(x))
|
||||
|
||||
|
||||
class Qwen35VisionPatchMerger(nn.Module):
    """Fuse groups of ``spatial_merge_size ** 2`` patch vectors into one output vector."""

    def __init__(self, hidden_size, spatial_merge_size, out_hidden_size, device=None, dtype=None, ops=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        merged_width = hidden_size * (spatial_merge_size ** 2)
        self.norm = ops.LayerNorm(hidden_size, eps=1e-6, **factory)
        self.linear_fc1 = ops.Linear(merged_width, merged_width, **factory)
        self.linear_fc2 = ops.Linear(merged_width, out_hidden_size, **factory)
        self.merge_dim = merged_width

    def forward(self, x):
        """Normalize each patch, concatenate consecutive groups, and project."""
        # Normalization is per patch; the view then stacks consecutive patches.
        grouped = self.norm(x).view(-1, self.merge_dim)
        hidden = F.gelu(self.linear_fc1(grouped))
        return self.linear_fc2(hidden)
|
||||
|
||||
|
||||
class Qwen35VisionModel(nn.Module):
    """Vision tower: patch embedding, position embeddings, attention blocks, merger.

    ``config`` is a plain dict; the keys read here are ``spatial_merge_size``,
    ``patch_size``, ``hidden_size``, ``num_heads``, ``num_position_embeddings``,
    ``intermediate_size``, ``depth`` and ``out_hidden_size`` (the patch
    embedding reads further keys from the same dict).
    """

    def __init__(self, config, device=None, dtype=None, ops=None):
        super().__init__()
        self.spatial_merge_size = config["spatial_merge_size"]
        self.patch_size = config["patch_size"]
        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size

        self.hidden_size = config["hidden_size"]
        self.num_heads = config["num_heads"]
        self.num_position_embeddings = config["num_position_embeddings"]

        self.patch_embed = Qwen35VisionPatchEmbed(config, device=device, dtype=dtype, ops=ops)
        self.pos_embed = ops.Embedding(self.num_position_embeddings, self.hidden_size, device=device, dtype=dtype)
        # The learned position table is treated as a square grid of this side.
        self.num_grid_per_side = int(self.num_position_embeddings ** 0.5)
        # Rotary table sized to half the head dim (row and column angles are
        # concatenated later in rot_pos_emb).
        self.rotary_pos_emb = Qwen35VisionRotaryEmbedding(self.hidden_size // self.num_heads // 2)
        self.blocks = nn.ModuleList([
            Qwen35VisionBlock(self.hidden_size, self.num_heads, config["intermediate_size"], device=device, dtype=dtype, ops=ops)
            for _ in range(config["depth"])
        ])
        self.merger = Qwen35VisionPatchMerger(self.hidden_size, self.spatial_merge_size, config["out_hidden_size"], device=device, dtype=dtype, ops=ops)

    def rot_pos_emb(self, grid_thw):
        """Return rotary angles for every patch, one row per token.

        ``grid_thw`` holds one ``(frames, height, width)`` row per image/video.
        Patch coordinates are enumerated merge-block by merge-block
        (``spatial_merge_size`` x ``spatial_merge_size`` patches at a time) and
        repeated for every frame; the row and column angles looked up for each
        coordinate pair are flattened into one vector per token.
        """
        merge_size = self.spatial_merge_size
        grid_thw_list = grid_thw.tolist()
        # One angle table large enough for the biggest side of any grid.
        max_hw = max(max(h, w) for _, h, w in grid_thw_list)
        freq_table = self.rotary_pos_emb(max_hw)
        device = freq_table.device
        total_tokens = sum(int(t * h * w) for t, h, w in grid_thw_list)
        pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)
        offset = 0
        for num_frames, height, width in grid_thw_list:
            num_frames, height, width = int(num_frames), int(height), int(width)
            merged_h, merged_w = height // merge_size, width // merge_size
            block_rows = torch.arange(merged_h, device=device)
            block_cols = torch.arange(merged_w, device=device)
            intra_row = torch.arange(merge_size, device=device)
            intra_col = torch.arange(merge_size, device=device)
            # Absolute row/col = block index * merge_size + offset inside the
            # block, broadcast over (block_row, block_col, intra_row, intra_col).
            row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
            col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]
            row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
            col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
            coords = torch.stack((row_idx, col_idx), dim=-1)
            # Every frame shares the same spatial coordinates.
            if num_frames > 1:
                coords = coords.repeat(num_frames, 1)
            num_tokens = coords.shape[0]
            pos_ids[offset:offset + num_tokens] = coords
            offset += num_tokens
        # [tokens, 2, freq] -> [tokens, 2 * freq]
        embeddings = freq_table[pos_ids]
        embeddings = embeddings.flatten(1)
        return embeddings

    def fast_pos_embed_interpolate(self, grid_thw):
        """Bilinearly resample the learned position grid to each image's patch grid.

        For every ``(t, h, w)`` row, ``h`` x ``w`` sample points are spread
        evenly over the ``num_grid_per_side`` square table; each sample blends
        its four surrounding table entries. The result is repeated for every
        frame and reordered so the patches of one merge block are contiguous.

        Returns:
            Position embeddings for all tokens of all grids, concatenated.
        """
        grid_thw_list = grid_thw.tolist()
        grid_ts = [int(row[0]) for row in grid_thw_list]
        grid_hs = [int(row[1]) for row in grid_thw_list]
        grid_ws = [int(row[2]) for row in grid_thw_list]
        device = self.pos_embed.weight.device
        # Four parallel lists, one per corner: (floor,floor), (floor,ceil),
        # (ceil,floor), (ceil,ceil).
        idx_list = [[] for _ in range(4)]
        weight_list = [[] for _ in range(4)]
        for t, h, w in grid_thw_list:
            h, w = int(h), int(w)
            # Fractional coordinates of the sample points in the table.
            h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h, device=device)
            w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w, device=device)
            h_idxs_floor = h_idxs.int()
            w_idxs_floor = w_idxs.int()
            # Clamp so the upper neighbour never leaves the table.
            h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
            w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
            dh = h_idxs - h_idxs_floor
            dw = w_idxs - w_idxs_floor
            # Flat table index = row * side + column.
            base_h = h_idxs_floor * self.num_grid_per_side
            base_h_ceil = h_idxs_ceil * self.num_grid_per_side
            indices = [
                (base_h[None].T + w_idxs_floor[None]).flatten(),
                (base_h[None].T + w_idxs_ceil[None]).flatten(),
                (base_h_ceil[None].T + w_idxs_floor[None]).flatten(),
                (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(),
            ]
            weights = [
                ((1 - dh)[None].T * (1 - dw)[None]).flatten(),
                ((1 - dh)[None].T * dw[None]).flatten(),
                (dh[None].T * (1 - dw)[None]).flatten(),
                (dh[None].T * dw[None]).flatten(),
            ]
            for j in range(4):
                idx_list[j].extend(indices[j].tolist())
                weight_list[j].extend(weights[j].tolist())
        idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=device)
        weight_tensor = torch.tensor(weight_list, dtype=self.pos_embed.weight.dtype, device=device)
        # Weighted sum of the four corner embeddings.
        pos_embeds = self.pos_embed(idx_tensor).to(device) * weight_tensor[:, :, None]
        patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]
        # One chunk of h*w rows per grid (a single frame's worth).
        patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)])
        patch_pos_embeds_permute = []
        merge_size = self.spatial_merge_size
        for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
            pos_embed = pos_embed.repeat(t, 1)
            # Reorder from row-major to merge-block-major token order.
            pos_embed = (
                pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1)
                .permute(0, 1, 3, 2, 4, 5)
                .flatten(0, 4)
            )
            patch_pos_embeds_permute.append(pos_embed)
        return torch.cat(patch_pos_embeds_permute)

    def forward(self, x, grid_thw):
        """Encode flattened patches ``x`` described by ``grid_thw``.

        Returns the merger output: one vector of width ``out_hidden_size`` per
        group of ``spatial_merge_size ** 2`` patches.
        """
        x = self.patch_embed(x)
        pos_embeds = self.fast_pos_embed_interpolate(grid_thw).to(x.device)
        x = x + pos_embeds
        rotary_pos_emb = self.rot_pos_emb(grid_thw)
        seq_len = x.shape[0]
        x = x.reshape(seq_len, -1)
        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
        # Duplicate the angles to cover both halves of each head, then pack
        # them as (cos, first half of sin, negated second half of sin) for
        # apply_rope — NOTE(review): layout presumably dictated by apply_rope,
        # which is defined outside this chunk.
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        cos = emb.cos().unsqueeze(-2)
        sin = emb.sin().unsqueeze(-2)
        sin_half = sin.shape[-1] // 2
        position_embeddings = (cos, sin[..., :sin_half], -sin[..., sin_half:])
        # Each frame is its own attention sequence of h*w tokens; cumulative
        # sums with a leading 0 give the sequence boundaries.
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
        optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
        for blk in self.blocks:
            x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings, optimized_attention=optimized_attention)
        merged = self.merger(x)
        return merged
|
||||
|
||||
# Model Wrapper
|
||||
class Qwen35(BaseLlama, BaseGenerate, torch.nn.Module):
|
||||
model_type = "qwen35_2b"
|
||||
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
config = _make_config(self.model_type, config_dict)
|
||||
self.num_layers = config.num_hidden_layers
|
||||
self.model = Qwen35Transformer(config, device=device, dtype=dtype, ops=operations)
|
||||
vision_overrides = QWEN35_MODELS.get(self.model_type, {}).get("vision", {})
|
||||
vision_config = {**QWEN35_VISION_DEFAULTS, **vision_overrides, "out_hidden_size": config.hidden_size}
|
||||
self.visual = Qwen35VisionModel(vision_config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
def preprocess_embed(self, embed, device):
|
||||
if embed["type"] == "image":
|
||||
image, grid = comfy.text_encoders.qwen_vl.process_qwen2vl_images(embed["data"], patch_size=16)
|
||||
return self.visual(image.to(device, dtype=torch.float32), grid), grid
|
||||
return None, None
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[], past_key_values=None):
|
||||
grid = None
|
||||
position_ids = None
|
||||
offset = 0
|
||||
for e in embeds_info:
|
||||
if e.get("type") == "image":
|
||||
grid = e.get("extra", None)
|
||||
start = e.get("index")
|
||||
if position_ids is None:
|
||||
position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
|
||||
position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
|
||||
end = e.get("size") + start
|
||||
len_max = int(grid.max()) // 2
|
||||
start_next = len_max + start
|
||||
position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
|
||||
position_ids[0, start:end] = start + offset
|
||||
max_d = int(grid[0][1]) // 2
|
||||
position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
|
||||
max_d = int(grid[0][2]) // 2
|
||||
position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
|
||||
offset += len_max - (end - start)
|
||||
|
||||
if grid is None:
|
||||
position_ids = None
|
||||
|
||||
return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids, past_key_values=past_key_values)
|
||||
|
||||
def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):
    """Allocate one cache entry per hidden layer.

    Layers whose ``layer_types`` entry is "linear_attention" get
    ``(recurrent_state, conv_state, 0)``: a zeroed float32 recurrent state
    and a zeroed conv state in ``execution_dtype``. Every other layer gets
    ``(keys, values, 0)`` with two uninitialised buffers of shape
    ``[batch, num_key_value_heads, max_cache_len, head_dim]``.

    Returns:
        A list with ``num_hidden_layers`` tuples, in layer order.
    """
    cfg = self.model.config
    cache = []

    for layer_idx in range(cfg.num_hidden_layers):
        if cfg.layer_types[layer_idx] != "linear_attention":
            kv_shape = [batch, cfg.num_key_value_heads, max_cache_len, cfg.head_dim]
            keys = torch.empty(kv_shape, device=device, dtype=execution_dtype)
            values = torch.empty(kv_shape, device=device, dtype=execution_dtype)
            cache.append((keys, values, 0))
            continue

        # The recurrent state is kept in float32 regardless of the execution dtype.
        state_shape = [batch, cfg.linear_num_value_heads, cfg.linear_key_head_dim, cfg.linear_value_head_dim]
        recurrent_state = torch.zeros(state_shape, device=device, dtype=torch.float32)

        # Channel count: two key-sized projections plus one value-sized projection.
        key_channels = cfg.linear_num_key_heads * cfg.linear_key_head_dim
        value_channels = cfg.linear_num_value_heads * cfg.linear_value_head_dim
        conv_dim = key_channels * 2 + value_channels
        conv_state = torch.zeros([batch, conv_dim, cfg.conv_kernel_size - 1], device=device, dtype=execution_dtype)

        cache.append((recurrent_state, conv_state, 0))

    return cache
|
||||
|
||||
# Tokenizer and Text Encoder Wrappers
|
||||
|
||||
class Qwen35Tokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer set up for the ``qwen35_tokenizer`` directory next to this module."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, embedding_size=2048, embedding_key="qwen35_2b"):
        # Function-scope import: transformers is only imported when a tokenizer is built.
        from transformers import Qwen2Tokenizer

        module_dir = os.path.dirname(os.path.realpath(__file__))
        tokenizer_path = os.path.join(module_dir, "qwen35_tokenizer")

        super().__init__(
            tokenizer_path,
            pad_with_end=False,
            embedding_directory=embedding_directory,
            embedding_size=embedding_size,
            embedding_key=embedding_key,
            tokenizer_class=Qwen2Tokenizer,
            # No start/end tokens and no padding to a fixed length.
            has_start_token=False,
            has_end_token=False,
            pad_to_max_length=False,
            max_length=99999999,
            min_length=1,
            pad_token=248044,
            tokenizer_data=tokenizer_data,
        )
|
||||
|
||||
|
||||
class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
    """Chat-template tokenizer that can splice image embeds into the token stream."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, model_type="qwen35_2b"):
        # Unknown model types fall back to a hidden size of 2048.
        embedding_size = QWEN35_MODELS.get(model_type, {}).get("hidden_size", 2048)

        def build_tokenizer(*args, **kwargs):
            return Qwen35Tokenizer(*args, **kwargs, embedding_size=embedding_size, embedding_key=model_type)

        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name=model_type, tokenizer=build_tokenizer)
        self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
        self.llama_template_images = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
        """Wrap ``text`` in the chat template, tokenize it and attach images.

        Args:
            text: Prompt text. Text that already starts with ``<|im_start|>``
                is used verbatim instead of being wrapped in a template.
            return_word_ids: Forwarded to the parent tokenizer.
            llama_template: Optional format string overriding the built-in
                templates.
            images: Images substituted, in order, for ``<|image_pad|>``
                tokens. A single ``image=`` keyword is used when this is empty.
            prevent_empty_text: Replace an empty ``text`` with one space.
            thinking: When false, an empty ``<think>`` block is appended.

        Returns:
            The parent tokenizer's token dict, with image-pad tokens replaced
            by image embed descriptors while images remain.
        """
        single_image = kwargs.get("image", None)
        if single_image is not None and len(images) == 0:
            images = [single_image]

        # Decided on the caller's text, before an empty prompt is replaced by a space.
        already_templated = text.startswith('<|im_start|>')
        if prevent_empty_text and text == '':
            text = ' '

        if already_templated:
            llama_text = text
        else:
            if llama_template is not None:
                template = llama_template
            elif len(images) > 0:
                template = self.llama_template_images
            else:
                template = self.llama_template
            llama_text = template.format(text)
            # NOTE(review): nesting was reconstructed from de-indented source. Confirm
            # that the empty think block is meant to be skipped for pre-templated text.
            if not thinking:
                llama_text += "<think>\n</think>\n"

        tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
        qwen_tokens = tokens[next(iter(tokens))]

        used = 0
        for row in qwen_tokens:
            for pos, entry in enumerate(row):
                # 248056 is <|image_pad|>; pads beyond the supplied images are left as-is.
                if entry[0] == 248056 and len(images) > used:
                    row[pos] = ({"type": "image", "data": images[used], "original_type": "image"},) + entry[1:]
                    used += 1
        return tokens
|
||||
|
||||
|
||||
class Qwen35ClipModel(sd1_clip.SDClipModel):
    """SDClipModel wrapper that instantiates ``Qwen35`` for a chosen model type."""

    def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}, model_type="qwen35_2b"):
        # Alias needed because a class body cannot read an enclosing variable
        # under the same name it assigns.
        selected_type = model_type

        # Per-instance subclass so that model_type is set without touching Qwen35 itself.
        class Qwen35_(Qwen35):
            model_type = selected_type

        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"pad": 248044},
            layer_norm_hidden_state=False,
            model_class=Qwen35_,
            # One flag drives both using attention masks and returning them.
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
class Qwen35TEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel container registering a Qwen35ClipModel under ``model_type``."""

    def __init__(self, device="cpu", dtype=None, model_options={}, model_type="qwen35_2b"):
        def build_clip_model(**kw):
            # Bind model_type onto whatever keyword arguments the parent supplies.
            return Qwen35ClipModel(**kw, model_type=model_type)

        super().__init__(device=device, dtype=dtype, name=model_type, clip_model=build_clip_model, model_options=model_options)
|
||||
|
||||
|
||||
def tokenizer(model_type="qwen35_2b"):
    """Return a ``Qwen35ImageTokenizer`` subclass with ``model_type`` pre-bound.

    The returned class takes only ``embedding_directory`` and
    ``tokenizer_data`` in its constructor.
    """
    bound_model_type = model_type

    class Qwen35ImageTokenizer_(Qwen35ImageTokenizer):
        def __init__(self, embedding_directory=None, tokenizer_data={}):
            super().__init__(
                embedding_directory=embedding_directory,
                tokenizer_data=tokenizer_data,
                model_type=bound_model_type,
            )

    return Qwen35ImageTokenizer_
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None, model_type="qwen35_2b"):
    """Return a ``Qwen35TEModel`` subclass with dtype, quantization and model type bound.

    Args:
        dtype_llama: When not ``None``, replaces the ``dtype`` passed to the
            returned class's constructor.
        llama_quantization_metadata: When not ``None``, stored under
            ``"quantization_metadata"`` in a copy of ``model_options``.
        model_type: Forwarded to ``Qwen35TEModel``.
    """
    class Qwen35TEModel_(Qwen35TEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            effective_dtype = dtype if dtype_llama is None else dtype_llama

            options = model_options
            if llama_quantization_metadata is not None:
                # Copy first so the caller's dict (or the shared default) is never modified.
                options = model_options.copy()
                options["quantization_metadata"] = llama_quantization_metadata

            super().__init__(device=device, dtype=effective_dtype, model_options=options, model_type=model_type)

    return Qwen35TEModel_
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
|
@ -425,4 +425,7 @@ class Qwen2VLVisionTransformer(nn.Module):
|
|||
hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention)
|
||||
|
||||
hidden_states = self.merger(hidden_states)
|
||||
# Potentially important for spatially precise edits. This is present in the HF implementation.
|
||||
reverse_indices = torch.argsort(window_index)
|
||||
hidden_states = hidden_states[reverse_indices, :]
|
||||
return hidden_states
|
||||
|
|
|
|||
|
|
@ -0,0 +1,97 @@
|
|||
import re
|
||||
from comfy import sd1_clip
|
||||
|
||||
# Text-model configuration handed to SDClipModel as textmodel_json_config.
SAM3_CLIP_CONFIG = {
    "architectures": ["CLIPTextModel"],
    "hidden_act": "quick_gelu",
    # Same width as the embedding_size used by SAM3Tokenizer.
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    # Same length as the max_length=32 used by the SAM3 model and tokenizer.
    "max_position_embeddings": 32,
    "projection_dim": 512,
    "vocab_size": 49408,
    "layer_norm_eps": 1e-5,
    "eos_token_id": 49407,
}
|
||||
|
||||
|
||||
class SAM3ClipModel(sd1_clip.SDClipModel):
    """SDClipModel built from ``SAM3_CLIP_CONFIG`` with a 32-token context."""

    def __init__(self, device="cpu", dtype=None, model_options={}):
        special_tokens = {"start": 49406, "end": 49407, "pad": 0}
        super().__init__(
            device=device,
            dtype=dtype,
            max_length=32,
            layer="last",
            textmodel_json_config=SAM3_CLIP_CONFIG,
            special_tokens=special_tokens,
            return_projected_pooled=False,
            # Attention masks are both applied and returned.
            return_attention_masks=True,
            enable_attention_masks=True,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
class SAM3Tokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer limited to 32 tokens, padded with token 0, with weights disabled."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            max_length=32,
            pad_with_end=False,
            pad_token=0,
            embedding_directory=embedding_directory,
            embedding_size=1024,
            embedding_key="sam3_clip",
            tokenizer_data=tokenizer_data,
        )
        # Set after the parent constructor has run.
        self.disable_weights = True
|
||||
|
||||
|
||||
def _parse_prompts(text):
|
||||
"""Split comma-separated prompts with optional :N max detections per category"""
|
||||
text = text.replace("(", "").replace(")", "")
|
||||
parts = [p.strip() for p in text.split(",") if p.strip()]
|
||||
result = []
|
||||
for part in parts:
|
||||
m = re.match(r'^(.+?)\s*:\s*([\d.]+)\s*$', part)
|
||||
if m:
|
||||
text_part = m.group(1).strip()
|
||||
val = m.group(2)
|
||||
max_det = max(1, round(float(val)))
|
||||
result.append((text_part, max_det))
|
||||
else:
|
||||
result.append((part, 1))
|
||||
return result
|
||||
|
||||
|
||||
class SAM3TokenizerWrapper(sd1_clip.SD1Tokenizer):
    """SD1Tokenizer wrapper that tokenizes each comma-separated SAM3 prompt on its own."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            clip_name="l",
            tokenizer=SAM3Tokenizer,
            name="sam3_clip",
        )

    def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
        """Tokenize ``text``, keeping per-prompt batches when it holds several prompts.

        A text that parses to nothing, or to one prompt with a detection
        count of 1, goes through the parent tokenizer unchanged. Otherwise
        each parsed prompt is tokenized separately. The result maps
        ``self.clip_name`` to the first prompt's batches and
        ``"sam3_per_prompt"`` to the full list of
        ``(batches, max_detections)`` pairs.
        """
        prompts = _parse_prompts(text)
        is_plain = not prompts or (len(prompts) == 1 and prompts[0][1] == 1)
        if is_plain:
            # The original, unparsed text is what gets tokenized on this path.
            return super().tokenize_with_weights(text, return_word_ids, **kwargs)

        inner = getattr(self, self.clip)
        per_prompt = [
            (inner.tokenize_with_weights(prompt_text, return_word_ids, **kwargs), max_det)
            for prompt_text, max_det in prompts
        ]
        # The first prompt doubles as the main output, for compatibility.
        return {self.clip_name: per_prompt[0][0], "sam3_per_prompt": per_prompt}
|
||||
|
||||
|
||||
class SAM3ClipModelWrapper(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper that encodes every SAM3 prompt separately."""

    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        # Extra keyword arguments are accepted but not forwarded.
        super().__init__(device=device, dtype=dtype, model_options=model_options, clip_name="l", clip_model=SAM3ClipModel, name="sam3_clip")

    def encode_token_weights(self, token_weight_pairs):
        """Encode tokens, adding per-prompt conditioning when it is present.

        Args:
            token_weight_pairs: Token dict from the tokenizer. It may carry a
                ``"sam3_per_prompt"`` list of ``(batches, max_detections)``
                pairs. The dict is never modified, so the same tokens can be
                encoded more than once.

        Returns:
            The parent's result when there is no per-prompt data. Otherwise
            ``(cond, pooled, extra)``: ``cond`` belongs to the first prompt,
            ``pooled`` is the first non-``None`` pooled output, and
            ``extra["sam3_multi_cond"]`` lists ``cond``, ``attention_mask``
            and ``max_detections`` for every prompt.
        """
        key = "sam3_per_prompt"
        if key not in token_weight_pairs:
            return super().encode_token_weights(token_weight_pairs)

        # Read without popping: removing the key would change the caller's
        # dict and lose the multi-prompt data on a second encode.
        per_prompt = token_weight_pairs[key]
        if not per_prompt:
            # Nothing to encode per prompt; give the parent the dict without the key.
            remaining = {k: v for k, v in token_weight_pairs.items() if k != key}
            return super().encode_token_weights(remaining)

        # Encode each prompt separately and collect the results.
        inner = getattr(self, self.clip)
        multi_cond = []
        first_pooled = None
        for batches, max_det in per_prompt:
            out = inner.encode_token_weights(batches)
            cond, pooled = out[0], out[1]
            extra = out[2] if len(out) > 2 else {}
            if first_pooled is None:
                first_pooled = pooled
            multi_cond.append({
                "cond": cond,
                "attention_mask": extra.get("attention_mask"),
                "max_detections": max_det,
            })

        # The first prompt is the main output (for non-SAM3 consumers); all prompts go in the metadata.
        main = multi_cond[0]
        main_extra = {}
        if main["attention_mask"] is not None:
            main_extra["attention_mask"] = main["attention_mask"]
        main_extra["sam3_multi_cond"] = multi_cond
        return (main["cond"], first_pooled, main_extra)
|
||||
|
|
@ -5,6 +5,11 @@ from comfy_api.latest._input import (
|
|||
MaskInput,
|
||||
LatentInput,
|
||||
VideoInput,
|
||||
CurvePoint,
|
||||
CurveInput,
|
||||
MonotoneCubicCurve,
|
||||
LinearCurve,
|
||||
RangeInput,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
|
|
@ -13,4 +18,9 @@ __all__ = [
|
|||
"MaskInput",
|
||||
"LatentInput",
|
||||
"VideoInput",
|
||||
"CurvePoint",
|
||||
"CurveInput",
|
||||
"MonotoneCubicCurve",
|
||||
"LinearCurve",
|
||||
"RangeInput",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,4 +1,6 @@
|
|||
from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput
|
||||
from .curve_types import CurvePoint, CurveInput, MonotoneCubicCurve, LinearCurve
|
||||
from .range_types import RangeInput
|
||||
from .video_types import VideoInput
|
||||
|
||||
__all__ = [
|
||||
|
|
@ -7,4 +9,9 @@ __all__ = [
|
|||
"VideoInput",
|
||||
"MaskInput",
|
||||
"LatentInput",
|
||||
"CurvePoint",
|
||||
"CurveInput",
|
||||
"MonotoneCubicCurve",
|
||||
"LinearCurve",
|
||||
"RangeInput",
|
||||
]
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue