feat(api-nodes): add automatic downscaling of videos for ByteDance 2 nodes (#13465)

2026-04-21 20:45:10 +03:00 · 2026-04-21 20:45:10 +03:00 · b38dd0ff23
parent ad94d47221
commit b38dd0ff23
4 changed files with 134 additions and 18 deletions
--- a/comfy_api_nodes/apis/bytedance.py
+++ b/comfy_api_nodes/apis/bytedance.py
@ -158,10 +158,17 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
    ("Custom", None, None),
 ]

-# Seedance 2.0 reference video pixel count limits per model.
+# Seedance 2.0 reference video pixel count limits per model and output resolution.
 SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
-    "dreamina-seedance-2-0-260128": {"min": 409_600, "max": 927_408},
-    "dreamina-seedance-2-0-fast-260128": {"min": 409_600, "max": 927_408},
+    "dreamina-seedance-2-0-260128": {
+        "480p": {"min": 409_600, "max": 927_408},
+        "720p": {"min": 409_600, "max": 927_408},
+        "1080p": {"min": 409_600, "max": 2_073_600},
+    },
+    "dreamina-seedance-2-0-fast-260128": {
+        "480p": {"min": 409_600, "max": 927_408},
+        "720p": {"min": 409_600, "max": 927_408},
+    },
 }

 # The time in this dictionary are given for 10 seconds duration.
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@ -35,6 +35,7 @@ from comfy_api_nodes.util import (
    get_number_of_images,
    image_tensor_pair_to_batch,
    poll_op,
+    resize_video_to_pixel_budget,
    sync_op,
    upload_audio_to_comfyapi,
    upload_image_to_comfyapi,
@ -69,9 +70,12 @@ DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-2504
 logger = logging.getLogger(__name__)


-def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None:
-    """Validate reference video pixel count against Seedance 2.0 model limits."""
-    limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
+def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: str, index: int) -> None:
+    """Validate reference video pixel count against Seedance 2.0 model limits for the selected resolution."""
+    model_limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
+    if not model_limits:
+        return
+    limits = model_limits.get(resolution)
    if not limits:
        return
    try:
@ -1373,6 +1377,14 @@ def _seedance2_reference_inputs(resolutions: list[str]):
                min=0,
            ),
        ),
+        IO.Boolean.Input(
+            "auto_downscale",
+            default=False,
+            advanced=True,
+            optional=True,
+            tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
+            "for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
+        ),
    ]


@ -1480,10 +1492,23 @@ class ByteDance2ReferenceNode(IO.ComfyNode):

        model_id = SEEDANCE_MODELS[model["model"]]
        has_video_input = len(reference_videos) > 0
+
+        if model.get("auto_downscale") and reference_videos:
+            max_px = (
+                SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {})
+                .get(model["resolution"], {})
+                .get("max")
+            )
+            if max_px:
+                for key in reference_videos:
+                    reference_videos[key] = resize_video_to_pixel_budget(
+                        reference_videos[key], max_px
+                    )
+
        total_video_duration = 0.0
        for i, key in enumerate(reference_videos, 1):
            video = reference_videos[key]
-            _validate_ref_video_pixels(video, model_id, i)
+            _validate_ref_video_pixels(video, model_id, model["resolution"], i)
            try:
                dur = video.get_duration()
                if dur < 1.8:
--- a/comfy_api_nodes/util/init.py
+++ b/comfy_api_nodes/util/init.py
@ -19,6 +19,7 @@ from .conversions import (
    image_tensor_pair_to_batch,
    pil_to_bytesio,
    resize_mask_to_image,
+    resize_video_to_pixel_budget,
    tensor_to_base64_string,
    tensor_to_bytesio,
    tensor_to_pil,
@ -90,6 +91,7 @@ __all__ = [
    "image_tensor_pair_to_batch",
    "pil_to_bytesio",
    "resize_mask_to_image",
+    "resize_video_to_pixel_budget",
    "tensor_to_base64_string",
    "tensor_to_bytesio",
    "tensor_to_pil",
--- a/comfy_api_nodes/util/conversions.py
+++ b/comfy_api_nodes/util/conversions.py
@ -129,19 +129,35 @@ def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO:
    return img_byte_arr


-def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor:
-    """Downscale input image tensor to roughly the specified total pixels."""
-    samples = image.movedim(-1, 1)
-    total = int(total_pixels)
-    scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
-    if scale_by >= 1:
-        return image
-    width = round(samples.shape[3] * scale_by)
-    height = round(samples.shape[2] * scale_by)
+def _compute_downscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
+    """Return downscaled (w, h) with even dims fitting ``total_pixels``, or None if already fits.

-    s = common_upscale(samples, width, height, "lanczos", "disabled")
-    s = s.movedim(1, -1)
-    return s
+    Source aspect ratio is preserved; output may drift by a fraction of a percent because both dimensions
+    are rounded down to even values (many  codecs require divisible-by-2).
+    """
+    pixels = src_w * src_h
+    if pixels <= total_pixels:
+        return None
+    scale = math.sqrt(total_pixels / pixels)
+    new_w = max(2, int(src_w * scale))
+    new_h = max(2, int(src_h * scale))
+    new_w -= new_w % 2
+    new_h -= new_h % 2
+    return new_w, new_h
+
+
+def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor:
+    """Downscale input image tensor to roughly the specified total pixels.
+
+    Output dimensions are rounded down to even values so that the result is guaranteed to fit within ``total_pixels``
+    and is compatible with codecs that require even dimensions (e.g. yuv420p).
+    """
+    samples = image.movedim(-1, 1)
+    dims = _compute_downscale_dims(samples.shape[3], samples.shape[2], int(total_pixels))
+    if dims is None:
+        return image
+    new_w, new_h = dims
+    return common_upscale(samples, new_w, new_h, "lanczos", "disabled").movedim(1, -1)


 def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
@ -399,6 +415,72 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video:
        raise RuntimeError(f"Failed to trim video: {str(e)}") from e


+def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video:
+    """Downscale a video to fit within ``total_pixels`` (w * h), preserving aspect ratio.
+
+    Returns the original video object untouched when it already fits. Preserves frame rate, duration, and audio.
+    Aspect ratio is preserved up to a fraction of a percent (even-dim rounding).
+    """
+    src_w, src_h = video.get_dimensions()
+    scale_dims = _compute_downscale_dims(src_w, src_h, total_pixels)
+    if scale_dims is None:
+        return video
+    return _apply_video_scale(video, scale_dims)
+
+
+def _apply_video_scale(video: Input.Video, scale_dims: tuple[int, int]) -> Input.Video:
+    """Re-encode ``video`` scaled to ``scale_dims`` with a single decode/encode pass."""
+    out_w, out_h = scale_dims
+    output_buffer = BytesIO()
+    input_container = None
+    output_container = None
+
+    try:
+        input_source = video.get_stream_source()
+        input_container = av.open(input_source, mode="r")
+        output_container = av.open(output_buffer, mode="w", format="mp4")
+
+        video_stream = output_container.add_stream("h264", rate=video.get_frame_rate())
+        video_stream.width = out_w
+        video_stream.height = out_h
+        video_stream.pix_fmt = "yuv420p"
+
+        audio_stream = None
+        for stream in input_container.streams:
+            if isinstance(stream, av.AudioStream):
+                audio_stream = output_container.add_stream("aac", rate=stream.sample_rate)
+                audio_stream.sample_rate = stream.sample_rate
+                audio_stream.layout = stream.layout
+                break
+
+        for frame in input_container.decode(video=0):
+            frame = frame.reformat(width=out_w, height=out_h, format="yuv420p")
+            for packet in video_stream.encode(frame):
+                output_container.mux(packet)
+        for packet in video_stream.encode():
+            output_container.mux(packet)
+
+        if audio_stream is not None:
+            input_container.seek(0)
+            for audio_frame in input_container.decode(audio=0):
+                for packet in audio_stream.encode(audio_frame):
+                    output_container.mux(packet)
+            for packet in audio_stream.encode():
+                output_container.mux(packet)
+
+        output_container.close()
+        input_container.close()
+        output_buffer.seek(0)
+        return InputImpl.VideoFromFile(output_buffer)
+
+    except Exception as e:
+        if input_container is not None:
+            input_container.close()
+        if output_container is not None:
+            output_container.close()
+        raise RuntimeError(f"Failed to resize video: {str(e)}") from e
+
+
 def _f32_pcm(wav: torch.Tensor) -> torch.Tensor:
    """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file."""
    if wav.dtype.is_floating_point: