From 6fbb6b6f49ccd1d7d336368540b71248e3701dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Thu, 23 Apr 2026 21:13:17 +0300 Subject: [PATCH] Fix LTXV Reference Audio node (#13531) --- comfy_extras/nodes_lt.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index d7c2e8744..19d8a387f 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -1,6 +1,7 @@ import nodes import node_helpers import torch +import torchaudio import comfy.model_management import comfy.model_sampling import comfy.samplers @@ -711,7 +712,14 @@ class LTXVReferenceAudio(io.ComfyNode): @classmethod def execute(cls, model, positive, negative, reference_audio, audio_vae, identity_guidance_scale, start_percent, end_percent) -> io.NodeOutput: # Encode reference audio to latents and patchify - audio_latents = audio_vae.encode(reference_audio) + sample_rate = reference_audio["sample_rate"] + vae_sample_rate = getattr(audio_vae, "audio_sample_rate", 44100) + if vae_sample_rate != sample_rate: + waveform = torchaudio.functional.resample(reference_audio["waveform"], sample_rate, vae_sample_rate) + else: + waveform = reference_audio["waveform"] + + audio_latents = audio_vae.encode(waveform.movedim(1, -1)) b, c, t, f = audio_latents.shape ref_tokens = audio_latents.permute(0, 2, 1, 3).reshape(b, t, c * f) ref_audio = {"tokens": ref_tokens}