[Partner Nodes] fix passing images to Grok LLM

Signed-off-by: bigcat88 <bigcat88@icloud.com>
[Partner Nodes] add new OpenRouterLLM node
2026-05-21 00:46:41 +08:00 · 2026-05-20 16:33:39 +03:00 · 2026-05-20 15:22:59 +03:00 · 2026-05-20 15:22:58 +03:00 · 2026-05-20 19:58:49 +08:00 · 2026-05-19 20:28:06 -07:00
24 changed files with 638 additions and 2907 deletions
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@ -1,13 +1,7 @@
 import torch
-import torch.nn.functional as F
-
 from comfy.text_encoders.bert import BertAttention
 import comfy.model_management
 from comfy.ldm.modules.attention import optimized_attention_for_device
-from comfy.ldm.depth_anything_3.reference_view_selector import (
-    select_reference_view, reorder_by_reference, restore_original_order,
-    THRESH_FOR_REF_SELECTION,
-)


 class Dino2AttentionOutput(torch.nn.Module):
@ -20,42 +14,13 @@ class Dino2AttentionOutput(torch.nn.Module):


 class Dino2AttentionBlock(torch.nn.Module):
-    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations,
-                 qk_norm=False):
+    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
        super().__init__()
-        self.heads = heads
-        self.head_dim = embed_dim // heads
        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
-        if qk_norm:
-            self.q_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
-            self.k_norm = operations.LayerNorm(self.head_dim, dtype=dtype, device=device)
-        else:
-            self.q_norm = None
-            self.k_norm = None

-    def forward(self, x, mask, optimized_attention, pos=None, rope=None):
-        # Fast path used by the existing CLIP-vision DINOv2 (no DA3 extensions).
-        if self.q_norm is None and rope is None:
-            return self.output(self.attention(x, mask, optimized_attention))
-
-        # DA3 path: do QKV manually so we can apply per-head QK-norm and 2D RoPE.
-        attn = self.attention
-        B, N, C = x.shape
-        h = self.heads
-        d = self.head_dim
-        q = attn.query(x).view(B, N, h, d).transpose(1, 2)
-        k = attn.key(x).view(B, N, h, d).transpose(1, 2)
-        v = attn.value(x).view(B, N, h, d).transpose(1, 2)
-        if self.q_norm is not None:
-            q = self.q_norm(q)
-            k = self.k_norm(k)
-        if rope is not None and pos is not None:
-            q = rope(q, pos)
-            k = rope(k, pos)
-        out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
-        out = out.transpose(1, 2).reshape(B, N, C)
-        return self.output(out)
+    def forward(self, x, mask, optimized_attention):
+        return self.output(self.attention(x, mask, optimized_attention))


 class LayerScale(torch.nn.Module):
@ -99,11 +64,9 @@ class SwiGLUFFN(torch.nn.Module):


 class Dino2Block(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn,
-                 qk_norm=False):
+    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn):
        super().__init__()
-        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations,
-                                             qk_norm=qk_norm)
+        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
        if use_swiglu_ffn:
@ -113,90 +76,19 @@ class Dino2Block(torch.nn.Module):
        self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
        self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

-    def forward(self, x, optimized_attention, pos=None, rope=None, attn_mask=None):
-        x = x + self.layer_scale1(self.attention(self.norm1(x), attn_mask, optimized_attention,
-                                                 pos=pos, rope=rope))
+    def forward(self, x, optimized_attention):
+        x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
        x = x + self.layer_scale2(self.mlp(self.norm2(x)))
        return x


-# -----------------------------------------------------------------------------
-# 2D Rotary position embedding (DA3 extension)
-# -----------------------------------------------------------------------------
-
-
-class _PositionGetter:
-    """Cache (h, w) -> flat (y, x) position grid used to feed ``rope``."""
-
-    def __init__(self):
-        self._cache: dict = {}
-
-    def __call__(self, batch_size: int, height: int, width: int, device) -> torch.Tensor:
-        key = (height, width, device)
-        if key not in self._cache:
-            y = torch.arange(height, device=device)
-            x = torch.arange(width, device=device)
-            self._cache[key] = torch.cartesian_prod(y, x)
-        cached = self._cache[key]
-        return cached.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
-
-
-class RotaryPositionEmbedding2D(torch.nn.Module):
-    """2D RoPE used by DA3-Small/Base. No learnable parameters."""
-
-    def __init__(self, frequency: float = 100.0):
-        super().__init__()
-        self.base_frequency = frequency
-        self._freq_cache: dict = {}
-
-    def _components(self, dim: int, seq_len: int, device, dtype):
-        key = (dim, seq_len, device, dtype)
-        if key not in self._freq_cache:
-            exp = torch.arange(0, dim, 2, device=device).float() / dim
-            inv_freq = 1.0 / (self.base_frequency ** exp)
-            pos = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
-            ang = torch.einsum("i,j->ij", pos, inv_freq)
-            ang = ang.to(dtype)
-            ang = torch.cat((ang, ang), dim=-1)
-            self._freq_cache[key] = (ang.cos().to(dtype), ang.sin().to(dtype))
-        return self._freq_cache[key]
-
-    @staticmethod
-    def _rotate(x: torch.Tensor) -> torch.Tensor:
-        d = x.shape[-1]
-        x1, x2 = x[..., : d // 2], x[..., d // 2:]
-        return torch.cat((-x2, x1), dim=-1)
-
-    def _apply_1d(self, tokens, positions, cos_c, sin_c):
-        cos = F.embedding(positions, cos_c)[:, None, :, :]
-        sin = F.embedding(positions, sin_c)[:, None, :, :]
-        return (tokens * cos) + (self._rotate(tokens) * sin)
-
-    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
-        feature_dim = tokens.size(-1) // 2
-        max_pos = int(positions.max()) + 1
-        cos_c, sin_c = self._components(feature_dim, max_pos, tokens.device, tokens.dtype)
-        v, h = tokens.chunk(2, dim=-1)
-        v = self._apply_1d(v, positions[..., 0], cos_c, sin_c)
-        h = self._apply_1d(h, positions[..., 1], cos_c, sin_c)
-        return torch.cat((v, h), dim=-1)
-
-
 class Dino2Encoder(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn,
-                 qknorm_start: int = -1):
+    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn):
        super().__init__()
-        self.layer = torch.nn.ModuleList([
-            Dino2Block(
-                dim, num_heads, layer_norm_eps, dtype, device, operations,
-                use_swiglu_ffn=use_swiglu_ffn,
-                qk_norm=(qknorm_start != -1 and i >= qknorm_start),
-            )
-            for i in range(num_layers)
-        ])
+        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
+                                          for _ in range(num_layers)])

    def forward(self, x, intermediate_output=None):
-        # Backward-compat path used by ``ClipVisionModel`` (no DA3 extensions).
        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)

        if intermediate_output is not None:
@ -230,27 +122,16 @@ class Dino2PatchEmbeddings(torch.nn.Module):


 class Dino2Embeddings(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations,
-                 patch_size: int = 14, image_size: int = 518,
-                 use_mask_token: bool = True,
-                 num_camera_tokens: int = 0):
+    def __init__(self, dim, dtype, device, operations):
        super().__init__()
+        patch_size = 14
+        image_size = 518
        self.patch_size = patch_size
-        self.image_size = image_size

        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) # mask_token is a pre-training param, kept only so strict loading accepts the key.
-        if use_mask_token:
-            self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
-        else:
-            self.mask_token = None
-        if num_camera_tokens > 0:
-            # DA3 stores (ref_token, src_token) pairs that get injected at the
-            # alt-attn boundary; see ``Dinov2Model._inject_camera_token``.
-            self.camera_token = torch.nn.Parameter(torch.empty(1, num_camera_tokens, dim, dtype=dtype, device=device))
-        else:
-            self.camera_token = None
+        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))

    def interpolate_pos_encoding(self, x, h_pixels, w_pixels):
        pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, torch.float32)
@ -259,22 +140,12 @@ class Dino2Embeddings(torch.nn.Module):
        patch_pos = pos_embed[:, 1:]
        N = patch_pos.shape[1]
        M = int(N ** 0.5)
-        assert N == M * M, f"DINOv2 position grid must be square, got N={N} patches (sqrt={M})"
        h0 = h_pixels // self.patch_size
        w0 = w_pixels // self.patch_size
-        # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
-        # scale_factor is (height_scale, width_scale) -- height MUST come first;
-        # swapping these only happens to work for square inputs and breaks
-        # non-square paths like DA3-Small / DA3-Base multi-view.
-        scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M)
+        scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M)  # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).

        patch_pos = patch_pos.reshape(1, M, M, -1).permute(0, 3, 1, 2)
        patch_pos = torch.nn.functional.interpolate(patch_pos, scale_factor=scale_factor, mode="bicubic", antialias=False)
-        assert (h0, w0) == patch_pos.shape[-2:], (
-            f"Interpolated pos-embed grid {tuple(patch_pos.shape[-2:])} does not match "
-            f"target patch grid ({h0}, {w0}) for input {h_pixels}x{w_pixels} (patch_size={self.patch_size}); "
-            f"check scale_factor axis order and +0.1 rounding workaround"
-        )
        patch_pos = patch_pos.permute(0, 2, 3, 1).flatten(1, 2)
        return torch.cat((class_pos, patch_pos), dim=1).to(x.dtype)

@ -290,21 +161,6 @@ class Dino2Embeddings(torch.nn.Module):


 class Dinov2Model(torch.nn.Module):
-    """DINOv2 vision backbone.
-
-    Supports two operating modes:
-
-    * **CLIP-vision DINOv2** (default): vanilla DINOv2-ViT used for
-      ``ClipVisionModel`` and SigLIP-style image encoding.
-    * **Depth Anything 3** extensions (opt-in via config keys): 2D RoPE,
-      QK-norm, alternating local/global attention, camera-token injection,
-      ``cat_token`` output and multi-layer feature extraction. These are
-      enabled when the corresponding fields (``alt_start``, ``qknorm_start``,
-      ``rope_start``, ``cat_token``) are set in ``config_dict``. When all of
-      them are at their disabled defaults this module behaves identically to
-      the historical ``Dinov2Model``.
-    """
-
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        num_layers = config_dict["num_hidden_layers"]
@ -312,51 +168,12 @@ class Dinov2Model(torch.nn.Module):
        heads = config_dict["num_attention_heads"]
        layer_norm_eps = config_dict["layer_norm_eps"]
        use_swiglu_ffn = config_dict["use_swiglu_ffn"]
-        patch_size = config_dict.get("patch_size", 14)
-        image_size = config_dict.get("image_size", 518)
-        use_mask_token = config_dict.get("use_mask_token", True)

-        # DA3 extensions (all default to disabled).
-        self.alt_start = config_dict.get("alt_start", -1)
-        self.qknorm_start = config_dict.get("qknorm_start", -1)
-        self.rope_start = config_dict.get("rope_start", -1)
-        self.cat_token = config_dict.get("cat_token", False)
-        rope_freq = config_dict.get("rope_freq", 100.0)
-
-        self.embed_dim = dim
-        self.patch_size = patch_size
-        self.num_register_tokens = 0
-        self.patch_start_idx = 1
-
-        if self.rope_start != -1 and rope_freq > 0:
-            self.rope = RotaryPositionEmbedding2D(frequency=rope_freq)
-            self._position_getter = _PositionGetter()
-        else:
-            self.rope = None
-            self._position_getter = None
-
-        # camera_token shape: (1, 2, dim) -> (ref_token, src_token).
-        num_cam_tokens = 2 if self.alt_start != -1 else 0
-
-        self.embeddings = Dino2Embeddings(
-            dim, dtype, device, operations,
-            patch_size=patch_size, image_size=image_size,
-            use_mask_token=use_mask_token, num_camera_tokens=num_cam_tokens,
-        )
-        self.encoder = Dino2Encoder(
-            dim, heads, layer_norm_eps, num_layers, dtype, device, operations,
-            use_swiglu_ffn=use_swiglu_ffn,
-            qknorm_start=self.qknorm_start,
-        )
+        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
+        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)

    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
-        if self.alt_start != -1:
-            raise RuntimeError(
-                "Dinov2Model.forward() is the backward-compatible CLIP-vision path and does not "
-                "apply DA3 extensions (RoPE, alternating attention, camera-token injection). "
-                "Use get_intermediate_layers_da3() for Depth Anything 3 models."
-            )
        x = self.embeddings(pixel_values)
        x, i = self.encoder(x, intermediate_output=intermediate_output)
        x = self.layernorm(x)
@ -364,21 +181,6 @@ class Dinov2Model(torch.nn.Module):
        return x, i, pooled_output, None

    def get_intermediate_layers(self, pixel_values, indices, apply_norm=True):
-        """Single-view multi-layer feature extraction (MoGe / vanilla DINOv2).
-
-        For the multi-view Depth Anything 3 path (RoPE, alt-attention,
-        camera-token injection, ref-view selection, cat_token), use
-        :meth:`get_intermediate_layers_da3` instead.
-
-        Args:
-            pixel_values: ``(B, 3, H, W)`` single-view input.
-            indices: layer indices to extract; supports negative indexing.
-            apply_norm: if True, apply the final layernorm to each output.
-
-        Returns:
-            list of ``(patch_tokens, cls_token)`` tuples with shapes
-            ``(B, N_patch, C)`` and ``(B, C)`` (one entry per ``indices``).
-        """
        x = self.embeddings(pixel_values)
        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
        n_layers = len(self.encoder.layer)
@ -395,166 +197,3 @@ class Dinov2Model(torch.nn.Module):
            if i >= max_idx:
                break
        return [cache[i] for i in resolved]
-
-    # ------------------------------------------------------------------
-    # Depth Anything 3 forward
-    # ------------------------------------------------------------------
-    def _prepare_rope_positions(self, B, S, H, W, device):
-        if self.rope is None:
-            return None, None
-        ph, pw = H // self.patch_size, W // self.patch_size
-        pos = self._position_getter(B * S, ph, pw, device=device)
-        # Shift so the cls/cam token at position 0 is reserved for "no diff".
-        pos = pos + 1
-        cls_pos = torch.zeros(B * S, self.patch_start_idx, 2, device=device, dtype=pos.dtype)
-        # Per-view local: real grid positions for patches, 0 for cls token.
-        pos_local = torch.cat([cls_pos, pos], dim=1)
-        # Global (across views): same grid positions; cls token still at 0,
-        # but patches share the same positions in every view.
-        pos_global = torch.cat([cls_pos, torch.zeros_like(pos) + 1], dim=1)
-        return pos_local, pos_global
-
-    def _inject_camera_token(self, x: torch.Tensor, B: int, S: int,
-                             cam_token: "torch.Tensor | None") -> torch.Tensor:
-        # x: (B, S, N, C). Replace token at index 0 with the camera token.
-        if cam_token is not None:
-            inj = cam_token
-        else:
-            ct = comfy.model_management.cast_to_device(self.embeddings.camera_token, x.device, x.dtype)
-            ref_token = ct[:, :1].expand(B, -1, -1)
-            src_token = ct[:, 1:].expand(B, max(S - 1, 0), -1)
-            inj = torch.cat([ref_token, src_token], dim=1)
-        x = x.clone()
-        x[:, :, 0] = inj
-        return x
-
-    def get_intermediate_layers_da3(self, pixel_values, out_layers, cam_token=None,
-                                ref_view_strategy="saddle_balanced",
-                                export_feat_layers=None):
-        """Multi-view multi-layer feature extraction used by Depth Anything 3.
-
-        Adds RoPE positions, alternating local/global attention across views,
-        camera-token injection, reference-view selection/reordering,
-        ``cat_token`` output and optional auxiliary feature exports on top of
-        the vanilla DINOv2 path. For the single-view MoGe / CLIP-vision use
-        case, see :meth:`get_intermediate_layers`.
-
-        Args:
-            pixel_values: ``(B, S, 3, H, W)`` views or ``(B, 3, H, W)``.
-            out_layers: indices into ``self.encoder.layer``.
-            cam_token: optional ``(B, S, dim)`` camera token to inject at
-                ``alt_start``. If ``None`` and the model has its own
-                ``camera_token`` parameter, that is used.
-            ref_view_strategy: when ``S >= 3`` and ``cam_token is None``,
-                pick a reference view via this strategy and move it to
-                position 0 right before the first alt-attention block.
-                The original view order is restored on the way out.
-            export_feat_layers: optional iterable of layer indices whose
-                local attention outputs to also return as auxiliary
-                features (``(B, S, N_patch, C)`` after final norm). Used
-                by the multi-view path to expose intermediate features
-                to the nested-architecture wrapper.
-
-        Returns:
-            ``(layer_outputs, aux_outputs)`` where ``layer_outputs`` is a
-            list of ``(patch_tokens, cls_or_cam_token)`` tuples (one per
-            ``out_layers`` entry) and ``aux_outputs`` is a list of
-            ``(B, S, N_patch, C)`` features for ``export_feat_layers``
-            (empty list when not requested).
-        """
-        if pixel_values.ndim == 4:
-            pixel_values = pixel_values.unsqueeze(1)
-        assert pixel_values.ndim == 5 and pixel_values.shape[2] == 3, \
-            f"expected (B,3,H,W) or (B,S,3,H,W); got {tuple(pixel_values.shape)}"
-        B, S, _, H, W = pixel_values.shape
-
-        # Patch + cls + (interpolated) pos embed for each view.
-        x = pixel_values.reshape(B * S, 3, H, W)
-        x = self.embeddings(x)                          # (B*S, 1+N, C)
-        x = x.reshape(B, S, x.shape[-2], x.shape[-1])    # (B, S, 1+N, C)
-
-        pos_local, pos_global = self._prepare_rope_positions(B, S, H, W, x.device)
-        # ``optimized_attention`` is only used by blocks without QK-norm/RoPE
-        # (vanilla DINOv2 path); enabling-aware blocks fall through to SDPA.
-        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
-
-        out_set = set(out_layers)
-        export_set = set(export_feat_layers) if export_feat_layers else set()
-        outputs: list[torch.Tensor] = []
-        aux_outputs: list[torch.Tensor] = []
-        local_x = x
-        b_idx = None
-
-
-        for i, blk in enumerate(self.encoder.layer):
-            apply_rope = self.rope is not None and i >= self.rope_start
-            block_rope = self.rope if apply_rope else None
-            l_pos = pos_local if apply_rope else None
-            g_pos = pos_global if apply_rope else None
-
-            # Reference-view selection threshold: matches the upstream constant
-            # ``THRESH_FOR_REF_SELECTION = 3``. Skipped when a user-supplied
-            # cam_token is provided (camera info already pins the geometry).
-            if (self.alt_start != -1 and i == self.alt_start - 1
-                    and S >= THRESH_FOR_REF_SELECTION and cam_token is None):
-                b_idx = select_reference_view(x, strategy=ref_view_strategy)
-                x = reorder_by_reference(x, b_idx)
-                local_x = reorder_by_reference(local_x, b_idx)
-
-            if self.alt_start != -1 and i == self.alt_start:
-                x = self._inject_camera_token(x, B, S, cam_token)
-
-            if self.alt_start != -1 and i >= self.alt_start and (i % 2 == 1):
-                # Global attention across views: flatten S into the seq dim.
-                t = x.reshape(B, S * x.shape[-2], x.shape[-1])
-                p = g_pos.reshape(B, S * g_pos.shape[-2], g_pos.shape[-1]) if g_pos is not None else None
-                t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
-                x = t.reshape(B, S, x.shape[-2], x.shape[-1])
-            else:
-                # Per-view local attention.
-                t = x.reshape(B * S, x.shape[-2], x.shape[-1])
-                p = l_pos.reshape(B * S, l_pos.shape[-2], l_pos.shape[-1]) if l_pos is not None else None
-                t = blk(t, optimized_attention=optimized_attention, pos=p, rope=block_rope)
-                x = t.reshape(B, S, x.shape[-2], x.shape[-1])
-                local_x = x
-
-            if i in out_set:
-                if self.cat_token:
-                    out_x = torch.cat([local_x, x], dim=-1)
-                else:
-                    out_x = x
-                # Restore original view order on the way out so heads see views
-                # in the user's expected order.
-                if b_idx is not None and self.alt_start != -1:
-                    out_x = restore_original_order(out_x, b_idx)
-                outputs.append(out_x)
-
-            if i in export_set:
-                aux = x
-                if b_idx is not None and self.alt_start != -1:
-                    aux = restore_original_order(aux, b_idx)
-                aux_outputs.append(aux)
-
-        # Apply final norm. When ``cat_token`` is set, only the right half
-        # ("global" features) is normalised; the left half is left as-is to
-        # match the upstream DA3 head signature.
-        normed: list[torch.Tensor] = []
-        cls_tokens: list[torch.Tensor] = []
-        for out_x in outputs:
-            cls_tokens.append(out_x[:, :, 0])
-            if out_x.shape[-1] == self.embed_dim:
-                normed.append(self.layernorm(out_x))
-            elif out_x.shape[-1] == self.embed_dim * 2:
-                left = out_x[..., :self.embed_dim]
-                right = self.layernorm(out_x[..., self.embed_dim:])
-                normed.append(torch.cat([left, right], dim=-1))
-            else:
-                raise ValueError(f"Unexpected token width: {out_x.shape[-1]}")
-
-        # Drop cls/cam token from the patch sequence.
-        normed = [o[..., 1 + self.num_register_tokens:, :] for o in normed]
-
-        # Final layernorm + drop cls token from auxiliary features too.
-        aux_normed = [self.layernorm(o)[..., 1 + self.num_register_tokens:, :]
-                      for o in aux_outputs]
-        return list(zip(normed, cls_tokens)), aux_normed
--- a/comfy/ldm/depth_anything_3/init.py
+++ b/comfy/ldm/depth_anything_3/init.py
@ -1,7 +0,0 @@
-# Depth Anything 3 - native ComfyUI port (Apache-2.0 monocular variants only).
-#
-# Supported variants:
-#   DA3-Small, DA3-Base                (vits/vitb backbone, DualDPT head)
-#   DA3Mono-Large, DA3Metric-Large     (vitl backbone, DPT head + sky mask)
-#
-# Original repo: https://github.com/ByteDance-Seed/Depth-Anything-3
--- a/comfy/ldm/depth_anything_3/camera.py
+++ b/comfy/ldm/depth_anything_3/camera.py
@ -1,204 +0,0 @@
-"""Camera-token encoder and decoder for Depth Anything 3.
-
-* :class:`CameraEnc` takes per-view extrinsics + intrinsics and produces a
-  per-view camera token that gets injected at the alt-attention boundary
-  in the DINOv2 backbone (block ``alt_start``).
-* :class:`CameraDec` takes the final-layer camera token output by the
-  backbone and predicts a 9-D pose encoding (translation, quaternion,
-  field-of-view).
-
-The module/parameter names match the upstream ``cam_enc.py``/``cam_dec.py``
-so HF safetensors load directly with no key remapping (the upstream uses
-fused QKV linears, which we replicate here).
-"""
-
-from __future__ import annotations
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from .transform import affine_inverse, extri_intri_to_pose_encoding
-
-
-# -----------------------------------------------------------------------------
-# Building blocks (mirror ``depth_anything_3.model.utils.{attention,block}``)
-# -----------------------------------------------------------------------------
-
-
-class _Mlp(nn.Module):
-    """Standard 2-layer MLP with GELU. Matches upstream ``utils.attention.Mlp``."""
-
-    def __init__(self, in_features, hidden_features=None, out_features=None,
-                 *, device=None, dtype=None, operations=None):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = operations.Linear(in_features, hidden_features, bias=True,
-                                     device=device, dtype=dtype)
-        self.fc2 = operations.Linear(hidden_features, out_features, bias=True,
-                                     device=device, dtype=dtype)
-
-    def forward(self, x):
-        return self.fc2(F.gelu(self.fc1(x)))
-
-
-class _LayerScale(nn.Module):
-    """Per-channel learnable scaling. Matches upstream ``LayerScale``."""
-
-    def __init__(self, dim, *, device=None, dtype=None):
-        super().__init__()
-        self.gamma = nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
-
-    def forward(self, x):
-        return x * self.gamma.to(dtype=x.dtype, device=x.device)
-
-
-class _Attention(nn.Module):
-    """Self-attention with fused QKV projection.
-
-    Mirrors upstream ``utils.attention.Attention``; layout matches the
-    HF safetensors (``attn.qkv.{weight,bias}`` and ``attn.proj.{weight,bias}``).
-    """
-
-    def __init__(self, dim, num_heads,
-                 *, device=None, dtype=None, operations=None):
-        super().__init__()
-        assert dim % num_heads == 0
-        self.num_heads = num_heads
-        self.head_dim = dim // num_heads
-        self.qkv = operations.Linear(dim, dim * 3, bias=True,
-                                     device=device, dtype=dtype)
-        self.proj = operations.Linear(dim, dim, bias=True,
-                                      device=device, dtype=dtype)
-
-    def forward(self, x):
-        B, N, C = x.shape
-        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
-        qkv = qkv.permute(2, 0, 3, 1, 4)            # 3, B, h, N, d
-        q, k, v = qkv.unbind(0)
-        out = F.scaled_dot_product_attention(q, k, v)
-        out = out.transpose(1, 2).reshape(B, N, C)
-        return self.proj(out)
-
-
-class _Block(nn.Module):
-    """Pre-norm transformer block with LayerScale.
-
-    Used by :class:`CameraEnc`. Layout follows upstream ``utils.block.Block``.
-    """
-
-    def __init__(self, dim, num_heads, mlp_ratio=4, init_values=0.01,
-                 *, device=None, dtype=None, operations=None):
-        super().__init__()
-        self.norm1 = operations.LayerNorm(dim, device=device, dtype=dtype)
-        self.attn = _Attention(dim, num_heads,
-                               device=device, dtype=dtype, operations=operations)
-        self.ls1 = _LayerScale(dim, device=device, dtype=dtype) if init_values else nn.Identity()
-        self.norm2 = operations.LayerNorm(dim, device=device, dtype=dtype)
-        self.mlp = _Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
-                        device=device, dtype=dtype, operations=operations)
-        self.ls2 = _LayerScale(dim, device=device, dtype=dtype) if init_values else nn.Identity()
-
-    def forward(self, x):
-        x = x + self.ls1(self.attn(self.norm1(x)))
-        x = x + self.ls2(self.mlp(self.norm2(x)))
-        return x
-
-
-class CameraEnc(nn.Module):
-    """Encode per-view (extrinsics, intrinsics) into a camera token.
-
-    Maps a 9-D pose-encoding vector through a small MLP up to the backbone's
-    ``embed_dim``, then runs ``trunk_depth`` transformer blocks. The output
-    has shape ``(B, S, embed_dim)`` and is injected at block ``alt_start``
-    of the DINOv2 backbone in place of the cls token.
-
-    Parameters mirror the upstream ``cam_enc.py`` so HF weights load directly.
-    """
-
-    def __init__(
-        self,
-        dim_out: int = 1024,
-        dim_in: int = 9,
-        trunk_depth: int = 4,
-        target_dim: int = 9,
-        num_heads: int = 16,
-        mlp_ratio: int = 4,
-        init_values: float = 0.01,
-        *,
-        device=None, dtype=None, operations=None,
-        **_kwargs,
-    ):
-        super().__init__()
-        self.target_dim = target_dim
-        self.trunk_depth = trunk_depth
-        self.trunk = nn.Sequential(*[
-            _Block(dim_out, num_heads=num_heads, mlp_ratio=mlp_ratio,
-                   init_values=init_values,
-                   device=device, dtype=dtype, operations=operations)
-            for _ in range(trunk_depth)
-        ])
-        self.token_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
-        self.trunk_norm = operations.LayerNorm(dim_out, device=device, dtype=dtype)
-        self.pose_branch = _Mlp(
-            in_features=dim_in,
-            hidden_features=dim_out // 2,
-            out_features=dim_out,
-            device=device, dtype=dtype, operations=operations,
-        )
-
-    def forward(self, extrinsics: torch.Tensor, intrinsics: torch.Tensor,
-                image_size_hw) -> torch.Tensor:
-        """Encode camera parameters into ``(B, S, dim_out)`` tokens."""
-        c2ws = affine_inverse(extrinsics)
-        pose_encoding = extri_intri_to_pose_encoding(c2ws, intrinsics, image_size_hw)
-        tokens = self.pose_branch(pose_encoding.to(self.pose_branch.fc1.weight.dtype))
-        tokens = self.token_norm(tokens)
-        tokens = self.trunk(tokens)
-        tokens = self.trunk_norm(tokens)
-        return tokens
-
-
-class CameraDec(nn.Module):
-    """Decode the final cam token into a 9-D pose encoding.
-
-    Output layout: ``[T(3), quat_xyzw(4), fov_h, fov_w]``. The translation is
-    always predicted by the network; the quaternion and FoV can either be
-    predicted or supplied via ``camera_encoding`` (used at training time
-    when GT cameras are available -- not exercised at inference here).
-
-    Parameters mirror the upstream ``cam_dec.py`` so HF weights load directly.
-    """
-
-    def __init__(self, dim_in: int = 1536,
-                 *, device=None, dtype=None, operations=None, **_kwargs):
-        super().__init__()
-        d = dim_in
-        self.backbone = nn.Sequential(
-            operations.Linear(d, d, device=device, dtype=dtype),
-            nn.ReLU(),
-            operations.Linear(d, d, device=device, dtype=dtype),
-            nn.ReLU(),
-        )
-        self.fc_t = operations.Linear(d, 3, device=device, dtype=dtype)
-        self.fc_qvec = operations.Linear(d, 4, device=device, dtype=dtype)
-        self.fc_fov = nn.Sequential(
-            operations.Linear(d, 2, device=device, dtype=dtype),
-            nn.ReLU(),
-        )
-
-    def forward(self, feat: torch.Tensor,
-                camera_encoding: "torch.Tensor | None" = None) -> torch.Tensor:
-        """Decode ``(B, N, dim_in)`` cam tokens into ``(B, N, 9)`` pose enc."""
-        B, N = feat.shape[:2]
-        feat = feat.reshape(B * N, -1)
-        feat = self.backbone(feat)
-        out_t = self.fc_t(feat.float()).reshape(B, N, 3)
-        if camera_encoding is None:
-            out_qvec = self.fc_qvec(feat.float()).reshape(B, N, 4)
-            out_fov = self.fc_fov(feat.float()).reshape(B, N, 2)
-        else:
-            out_qvec = camera_encoding[..., 3:7]
-            out_fov = camera_encoding[..., -2:]
-        return torch.cat([out_t, out_qvec, out_fov], dim=-1)
--- a/comfy/ldm/depth_anything_3/dpt.py
+++ b/comfy/ldm/depth_anything_3/dpt.py
@ -1,549 +0,0 @@
-# DPT / DualDPT heads for Depth Anything 3.
-#
-# Ported from:
-#   src/depth_anything_3/model/dpt.py        (DPT - single main head + sky head)
-#   src/depth_anything_3/model/dualdpt.py    (DualDPT - depth + auxiliary "ray" head)
-#
-# In the monocular path we always discard the auxiliary "ray" output of
-# DualDPT. The auxiliary branch is still constructed so that DA3 HF weights
-# load cleanly without missing-key warnings.
-
-from __future__ import annotations
-
-from typing import List, Optional, Sequence, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-# -----------------------------------------------------------------------------
-# Helpers (matching upstream head_utils.py)
-# -----------------------------------------------------------------------------
-
-
-class Permute(nn.Module):
-    def __init__(self, dims: Tuple[int, ...]):
-        super().__init__()
-        self.dims = dims
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return x.permute(*self.dims)
-
-
-def _custom_interpolate(
-    x: torch.Tensor,
-    size: Optional[Tuple[int, int]] = None,
-    scale_factor: Optional[float] = None,
-    mode: str = "bilinear",
-    align_corners: bool = True,
-) -> torch.Tensor:
-    if size is None:
-        assert scale_factor is not None
-        size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
-    INT_MAX = 1610612736
-    total = size[0] * size[1] * x.shape[0] * x.shape[1]
-    if total > INT_MAX:
-        chunks = torch.chunk(x, chunks=(total // INT_MAX) + 1, dim=0)
-        outs = [F.interpolate(c, size=size, mode=mode, align_corners=align_corners) for c in chunks]
-        return torch.cat(outs, dim=0).contiguous()
-    return F.interpolate(x, size=size, mode=mode, align_corners=align_corners)
-
-
-def _create_uv_grid(width: int, height: int, aspect_ratio: float,
-                    dtype, device) -> torch.Tensor:
-    """Normalised UV grid spanning (-x_span, -y_span)..(x_span, y_span)."""
-    diag_factor = (aspect_ratio ** 2 + 1.0) ** 0.5
-    span_x = aspect_ratio / diag_factor
-    span_y = 1.0 / diag_factor
-    left_x = -span_x * (width - 1) / width
-    right_x = span_x * (width - 1) / width
-    top_y = -span_y * (height - 1) / height
-    bottom_y = span_y * (height - 1) / height
-    x_coords = torch.linspace(left_x, right_x, steps=width, dtype=dtype, device=device)
-    y_coords = torch.linspace(top_y, bottom_y, steps=height, dtype=dtype, device=device)
-    uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
-    return torch.stack((uu, vv), dim=-1)  # (H, W, 2)
-
-
-def _make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100.0) -> torch.Tensor:
-    omega = torch.arange(embed_dim // 2, dtype=torch.float32, device=pos.device)
-    omega = 1.0 / omega_0 ** (omega / (embed_dim / 2.0))
-    pos = pos.reshape(-1)
-    out = torch.einsum("m,d->md", pos, omega)
-    return torch.cat([out.sin(), out.cos()], dim=1).float()
-
-
-def _position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int,
-                            omega_0: float = 100.0) -> torch.Tensor:
-    H, W, _ = pos_grid.shape
-    pos_flat = pos_grid.reshape(-1, 2)
-    emb_x = _make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 0], omega_0=omega_0)
-    emb_y = _make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 1], omega_0=omega_0)
-    emb = torch.cat([emb_x, emb_y], dim=-1)
-    return emb.view(H, W, embed_dim)
-
-
-def _add_pos_embed(x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
-    """Stateless UV positional embedding added to a feature map (B, C, h, w)."""
-    pw, ph = x.shape[-1], x.shape[-2]
-    pe = _create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
-    pe = _position_grid_to_embed(pe, x.shape[1]) * ratio
-    pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1).to(dtype=x.dtype)
-    return x + pe
-
-
-def _apply_activation(x: torch.Tensor, activation: str) -> torch.Tensor:
-    act = (activation or "linear").lower()
-    if act == "exp":
-        return torch.exp(x)
-    if act == "expp1":
-        return torch.exp(x) + 1
-    if act == "expm1":
-        return torch.expm1(x)
-    if act == "relu":
-        return torch.relu(x)
-    if act == "sigmoid":
-        return torch.sigmoid(x)
-    if act == "softplus":
-        return F.softplus(x)
-    if act == "tanh":
-        return torch.tanh(x)
-    return x
-
-
-# -----------------------------------------------------------------------------
-# Fusion building blocks
-# -----------------------------------------------------------------------------
-
-
-class ResidualConvUnit(nn.Module):
-    def __init__(self, features: int,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-        self.conv1 = operations.Conv2d(features, features, 3, 1, 1, bias=True,
-                                       device=device, dtype=dtype)
-        self.conv2 = operations.Conv2d(features, features, 3, 1, 1, bias=True,
-                                       device=device, dtype=dtype)
-        self.activation = nn.ReLU(inplace=False)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        out = self.activation(x)
-        out = self.conv1(out)
-        out = self.activation(out)
-        out = self.conv2(out)
-        return out + x
-
-
-class FeatureFusionBlock(nn.Module):
-    def __init__(self, features: int, has_residual: bool = True,
-                 align_corners: bool = True,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-        self.align_corners = align_corners
-        self.has_residual = has_residual
-        if has_residual:
-            self.resConfUnit1 = ResidualConvUnit(features, device=device, dtype=dtype, operations=operations)
-        else:
-            self.resConfUnit1 = None
-        self.resConfUnit2 = ResidualConvUnit(features, device=device, dtype=dtype, operations=operations)
-        self.out_conv = operations.Conv2d(features, features, 1, 1, 0, bias=True,
-                                          device=device, dtype=dtype)
-
-    def forward(self, *xs: torch.Tensor, size: Optional[Tuple[int, int]] = None) -> torch.Tensor:
-        y = xs[0]
-        if self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None:
-            y = y + self.resConfUnit1(xs[1])
-        y = self.resConfUnit2(y)
-        if size is None:
-            up_kwargs = {"scale_factor": 2.0}
-        else:
-            up_kwargs = {"size": size}
-        y = _custom_interpolate(y, **up_kwargs, mode="bilinear",
-                                align_corners=self.align_corners)
-        y = self.out_conv(y)
-        return y
-
-
-class _Scratch(nn.Module):
-    """Container that mirrors upstream ``scratch`` attribute layout."""
-
-
-def _make_scratch(in_shape: List[int], out_shape: int,
-                  device=None, dtype=None, operations=None) -> _Scratch:
-    scratch = _Scratch()
-    scratch.layer1_rn = operations.Conv2d(in_shape[0], out_shape, 3, 1, 1, bias=False,
-                                          device=device, dtype=dtype)
-    scratch.layer2_rn = operations.Conv2d(in_shape[1], out_shape, 3, 1, 1, bias=False,
-                                          device=device, dtype=dtype)
-    scratch.layer3_rn = operations.Conv2d(in_shape[2], out_shape, 3, 1, 1, bias=False,
-                                          device=device, dtype=dtype)
-    scratch.layer4_rn = operations.Conv2d(in_shape[3], out_shape, 3, 1, 1, bias=False,
-                                          device=device, dtype=dtype)
-    return scratch
-
-
-def _make_fusion_block(features: int, has_residual: bool = True,
-                       device=None, dtype=None, operations=None) -> FeatureFusionBlock:
-    return FeatureFusionBlock(features, has_residual=has_residual,
-                              align_corners=True,
-                              device=device, dtype=dtype, operations=operations)
-
-
-# -----------------------------------------------------------------------------
-# DPT (single head + optional sky head) -- used by DA3Mono/Metric
-# -----------------------------------------------------------------------------
-
-
-class DPT(nn.Module):
-    """Single-head DPT used by DA3Mono-Large and DA3Metric-Large."""
-
-    def __init__(
-        self,
-        dim_in: int,
-        patch_size: int = 14,
-        output_dim: int = 1,
-        activation: str = "exp",
-        conf_activation: str = "expp1",
-        features: int = 256,
-        out_channels: Sequence[int] = (256, 512, 1024, 1024),
-        pos_embed: bool = False,
-        down_ratio: int = 1,
-        head_name: str = "depth",
-        use_sky_head: bool = True,
-        sky_name: str = "sky",
-        sky_activation: str = "relu",
-        norm_type: str = "idt",
-        device=None, dtype=None, operations=None,
-    ):
-        super().__init__()
-        self.patch_size = patch_size
-        self.activation = activation
-        self.conf_activation = conf_activation
-        self.pos_embed = pos_embed
-        self.down_ratio = down_ratio
-        self.head_main = head_name
-        self.sky_name = sky_name
-        self.out_dim = output_dim
-        self.has_conf = output_dim > 1
-        self.use_sky_head = use_sky_head
-        self.sky_activation = sky_activation
-        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
-
-        if norm_type == "layer":
-            self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
-        else:
-            self.norm = nn.Identity()
-
-        out_channels = list(out_channels)
-        self.projects = nn.ModuleList([
-            operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0,
-                              device=device, dtype=dtype)
-            for oc in out_channels
-        ])
-        self.resize_layers = nn.ModuleList([
-            operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0,
-                                       device=device, dtype=dtype),
-            operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0,
-                                       device=device, dtype=dtype),
-            nn.Identity(),
-            operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1,
-                              device=device, dtype=dtype),
-        ])
-
-        self.scratch = _make_scratch(out_channels, features,
-                                     device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False,
-                                                    device=device, dtype=dtype, operations=operations)
-
-        head_features_1 = features
-        head_features_2 = 32
-        self.scratch.output_conv1 = operations.Conv2d(
-            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
-            device=device, dtype=dtype,
-        )
-        self.scratch.output_conv2 = nn.Sequential(
-            operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
-                              device=device, dtype=dtype),
-            nn.ReLU(inplace=False),
-            operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0,
-                              device=device, dtype=dtype),
-        )
-
-        if self.use_sky_head:
-            self.scratch.sky_output_conv2 = nn.Sequential(
-                operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
-                                  device=device, dtype=dtype),
-                nn.ReLU(inplace=False),
-                operations.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0,
-                                  device=device, dtype=dtype),
-            )
-
-    def forward(self, feats: List[torch.Tensor], H: int, W: int,
-                patch_start_idx: int = 0, **_kwargs) -> dict:
-        # feats[i][0] is the patch-token tensor with shape (B, S, N_patch, C)
-        B, S, N, C = feats[0][0].shape
-        feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]
-
-        ph, pw = H // self.patch_size, W // self.patch_size
-        resized = []
-        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
-            x = feats_flat[take_idx][:, patch_start_idx:]
-            x = self.norm(x)
-            x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
-            x = self.projects[stage_idx](x)
-            if self.pos_embed:
-                x = _add_pos_embed(x, W, H)
-            x = self.resize_layers[stage_idx](x)
-            resized.append(x)
-
-        l1_rn = self.scratch.layer1_rn(resized[0])
-        l2_rn = self.scratch.layer2_rn(resized[1])
-        l3_rn = self.scratch.layer3_rn(resized[2])
-        l4_rn = self.scratch.layer4_rn(resized[3])
-
-        out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
-        out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
-        out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
-        out = self.scratch.refinenet1(out, l1_rn)
-
-        h_out = int(ph * self.patch_size / self.down_ratio)
-        w_out = int(pw * self.patch_size / self.down_ratio)
-
-        fused = self.scratch.output_conv1(out)
-        fused = _custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)
-        if self.pos_embed:
-            fused = _add_pos_embed(fused, W, H)
-        feat = fused
-
-        main_logits = self.scratch.output_conv2(feat)
-        outs = {}
-        if self.has_conf:
-            fmap = main_logits.permute(0, 2, 3, 1)
-            pred = _apply_activation(fmap[..., :-1], self.activation)
-            conf = _apply_activation(fmap[..., -1], self.conf_activation)
-            outs[self.head_main] = pred.squeeze(-1).view(B, S, *pred.shape[1:-1])
-            outs[f"{self.head_main}_conf"] = conf.view(B, S, *conf.shape[1:])
-        else:
-            pred = _apply_activation(main_logits, self.activation)
-            outs[self.head_main] = pred.squeeze(1).view(B, S, *pred.shape[2:])
-
-        if self.use_sky_head:
-            sky_logits = self.scratch.sky_output_conv2(feat)
-            if self.sky_activation.lower() == "sigmoid":
-                sky = torch.sigmoid(sky_logits)
-            elif self.sky_activation.lower() == "relu":
-                sky = F.relu(sky_logits)
-            else:
-                sky = sky_logits
-            outs[self.sky_name] = sky.squeeze(1).view(B, S, *sky.shape[2:])
-
-        return outs
-
-
-# -----------------------------------------------------------------------------
-# DualDPT (depth + auxiliary "ray" head) -- used by DA3-Small / DA3-Base
-# -----------------------------------------------------------------------------
-
-
-class DualDPT(nn.Module):
-    """Two-head DPT used by DA3-Small / DA3-Base.
-
-    The auxiliary "ray" head is constructed so that HF state-dict keys load
-    cleanly. It is only executed when :attr:`enable_aux` is set on the
-    instance (typically by ``DepthAnything3Net`` when running multi-view
-    with ``use_ray_pose=True``); otherwise the monocular path skips it for
-    speed and the auxiliary submodules sit idle.
-    """
-
-    def __init__(
-        self,
-        dim_in: int,
-        patch_size: int = 14,
-        output_dim: int = 2,
-        activation: str = "exp",
-        conf_activation: str = "expp1",
-        features: int = 256,
-        out_channels: Sequence[int] = (256, 512, 1024, 1024),
-        pos_embed: bool = True,
-        down_ratio: int = 1,
-        aux_pyramid_levels: int = 4,
-        aux_out1_conv_num: int = 5,
-        head_names: Tuple[str, str] = ("depth", "ray"),
-        device=None, dtype=None, operations=None,
-    ):
-        super().__init__()
-        self.patch_size = patch_size
-        self.activation = activation
-        self.conf_activation = conf_activation
-        self.pos_embed = pos_embed
-        self.down_ratio = down_ratio
-        self.aux_levels = aux_pyramid_levels
-        self.aux_out1_conv_num = aux_out1_conv_num
-        self.head_main, self.head_aux = head_names
-        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
-        # Toggle the auxiliary ray branch at runtime. Default off (mono path).
-        # ``DepthAnything3Net`` flips this on when running multi-view + ray-pose.
-        self.enable_aux: bool = False
-
-        self.norm = operations.LayerNorm(dim_in, device=device, dtype=dtype)
-        out_channels = list(out_channels)
-        self.projects = nn.ModuleList([
-            operations.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0,
-                              device=device, dtype=dtype)
-            for oc in out_channels
-        ])
-        self.resize_layers = nn.ModuleList([
-            operations.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0,
-                                       device=device, dtype=dtype),
-            operations.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0,
-                                       device=device, dtype=dtype),
-            nn.Identity(),
-            operations.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1,
-                              device=device, dtype=dtype),
-        ])
-
-        self.scratch = _make_scratch(out_channels, features,
-                                     device=device, dtype=dtype, operations=operations)
-        # Main fusion chain
-        self.scratch.refinenet1 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet2 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet3 = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False,
-                                                    device=device, dtype=dtype, operations=operations)
-        # Auxiliary fusion chain (separate copies)
-        self.scratch.refinenet1_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet2_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet3_aux = _make_fusion_block(features, device=device, dtype=dtype, operations=operations)
-        self.scratch.refinenet4_aux = _make_fusion_block(features, has_residual=False,
-                                                        device=device, dtype=dtype, operations=operations)
-
-        head_features_1 = features
-        head_features_2 = 32
-
-        # Main head neck + final projection
-        self.scratch.output_conv1 = operations.Conv2d(
-            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1,
-            device=device, dtype=dtype,
-        )
-        self.scratch.output_conv2 = nn.Sequential(
-            operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
-                              device=device, dtype=dtype),
-            nn.ReLU(inplace=False),
-            operations.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0,
-                              device=device, dtype=dtype),
-        )
-
-        # Aux pre-head per level (multi-level pyramid)
-        self.scratch.output_conv1_aux = nn.ModuleList([
-            self._make_aux_out1_block(head_features_1, device=device, dtype=dtype, operations=operations)
-            for _ in range(self.aux_levels)
-        ])
-
-        # Aux final projection per level (includes LayerNorm permute path).
-        ln_seq = [Permute((0, 2, 3, 1)),
-                  operations.LayerNorm(head_features_2, device=device, dtype=dtype),
-                  Permute((0, 3, 1, 2))]
-        self.scratch.output_conv2_aux = nn.ModuleList([
-            nn.Sequential(
-                operations.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1,
-                                  device=device, dtype=dtype),
-                *ln_seq,
-                nn.ReLU(inplace=False),
-                operations.Conv2d(head_features_2, 7, kernel_size=1, stride=1, padding=0,
-                                  device=device, dtype=dtype),
-            )
-            for _ in range(self.aux_levels)
-        ])
-
-    @staticmethod
-    def _make_aux_out1_block(in_ch: int, *, device=None, dtype=None, operations=None) -> nn.Sequential:
-        # aux_out1_conv_num=5 in all Apache-2.0 variants.
-        return nn.Sequential(
-            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
-            operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
-            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
-            operations.Conv2d(in_ch // 2, in_ch, 3, 1, 1, device=device, dtype=dtype),
-            operations.Conv2d(in_ch, in_ch // 2, 3, 1, 1, device=device, dtype=dtype),
-        )
-
-    def forward(self, feats: List[torch.Tensor], H: int, W: int,
-                patch_start_idx: int = 0, **_kwargs) -> dict:
-        B, S, N, C = feats[0][0].shape
-        feats_flat = [feat[0].reshape(B * S, N, C) for feat in feats]
-
-        ph, pw = H // self.patch_size, W // self.patch_size
-        resized = []
-        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
-            x = feats_flat[take_idx][:, patch_start_idx:]
-            x = self.norm(x)
-            x = x.permute(0, 2, 1).contiguous().reshape(B * S, C, ph, pw)
-            x = self.projects[stage_idx](x)
-            if self.pos_embed:
-                x = _add_pos_embed(x, W, H)
-            x = self.resize_layers[stage_idx](x)
-            resized.append(x)
-
-        l1_rn = self.scratch.layer1_rn(resized[0])
-        l2_rn = self.scratch.layer2_rn(resized[1])
-        l3_rn = self.scratch.layer3_rn(resized[2])
-        l4_rn = self.scratch.layer4_rn(resized[3])
-
-        # Main pyramid (output_conv1 is applied inside the upstream `_fuse`,
-        # before interpolation -- replicate that order here).
-        m = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
-        if self.enable_aux:
-            a4 = self.scratch.refinenet4_aux(l4_rn, size=l3_rn.shape[2:])
-            aux_pyr = [a4]
-        m = self.scratch.refinenet3(m, l3_rn, size=l2_rn.shape[2:])
-        if self.enable_aux:
-            aux_pyr.append(self.scratch.refinenet3_aux(aux_pyr[-1], l3_rn, size=l2_rn.shape[2:]))
-        m = self.scratch.refinenet2(m, l2_rn, size=l1_rn.shape[2:])
-        if self.enable_aux:
-            aux_pyr.append(self.scratch.refinenet2_aux(aux_pyr[-1], l2_rn, size=l1_rn.shape[2:]))
-        m = self.scratch.refinenet1(m, l1_rn)
-        if self.enable_aux:
-            aux_pyr.append(self.scratch.refinenet1_aux(aux_pyr[-1], l1_rn))
-        m = self.scratch.output_conv1(m)
-
-        h_out = int(ph * self.patch_size / self.down_ratio)
-        w_out = int(pw * self.patch_size / self.down_ratio)
-
-        m = _custom_interpolate(m, (h_out, w_out), mode="bilinear", align_corners=True)
-        if self.pos_embed:
-            m = _add_pos_embed(m, W, H)
-        main_logits = self.scratch.output_conv2(m)
-        fmap = main_logits.permute(0, 2, 3, 1)
-        depth_pred = _apply_activation(fmap[..., :-1], self.activation)
-        depth_conf = _apply_activation(fmap[..., -1], self.conf_activation)
-
-        outs = {
-            self.head_main: depth_pred.squeeze(-1).view(B, S, *depth_pred.shape[1:-1]),
-            f"{self.head_main}_conf": depth_conf.view(B, S, *depth_conf.shape[1:]),
-        }
-
-        if self.enable_aux:
-            # Auxiliary "ray" head (multi-level inside) -- only the last level
-            # is returned. Mirrors upstream ``DualDPT._fuse`` + ``_forward_impl``:
-            # each aux pyramid level goes through ``output_conv1_aux[i]``
-            # (5-layer conv stack that ends at ``features // 2`` channels),
-            # then the last level optionally gets a pos-embed and finally
-            # ``output_conv2_aux[-1]``.
-            aux_processed = [
-                self.scratch.output_conv1_aux[i](a) for i, a in enumerate(aux_pyr)
-            ]
-            last_aux = aux_processed[-1]
-            if self.pos_embed:
-                last_aux = _add_pos_embed(last_aux, W, H)
-            last_aux_logits = self.scratch.output_conv2_aux[-1](last_aux)
-            fmap_last = last_aux_logits.permute(0, 2, 3, 1)
-            # Channels: [ray(6), ray_conf(1)]; ray uses 'linear' activation.
-            aux_pred = fmap_last[..., :-1]
-            aux_conf = _apply_activation(fmap_last[..., -1], self.conf_activation)
-            outs[self.head_aux] = aux_pred.view(B, S, *aux_pred.shape[1:])
-            outs[f"{self.head_aux}_conf"] = aux_conf.view(B, S, *aux_conf.shape[1:])
-
-        return outs
--- a/comfy/ldm/depth_anything_3/model.py
+++ b/comfy/ldm/depth_anything_3/model.py
@ -1,300 +0,0 @@
-# DepthAnything3Net: top-level wrapper that combines backbone + head.
-#
-# Supports both the monocular and the multi-view + camera path:
-#
-# * Monocular: ``S = 1``, no camera encoder/decoder. Mirrors the original
-#   port that only handled ``DA3-MONO/METRIC-LARGE`` and the auxiliary-disabled
-#   ``DA3-SMALL/BASE`` configs.
-# * Multi-view + camera: ``S > 1``. ``cam_enc`` (optional) maps user-supplied
-#   extrinsics + intrinsics into a per-view camera token; ``cam_dec`` decodes
-#   the final layer's camera token into a 9-D pose encoding. When the
-#   auxiliary "ray" head of ``DualDPT`` is enabled the predicted ray map can
-#   alternatively be used to estimate pose via RANSAC (``use_ray_pose=True``).
-#   The 3D-Gaussian head and the nested-architecture wrapper are intentionally
-#   left out of scope here; their state-dict keys are filtered in
-#   ``comfy.supported_models.DepthAnything3.process_unet_state_dict``.
-#
-# The backbone is shared with the CLIP-vision DINOv2 path
-# (``comfy.image_encoders.dino2.Dinov2Model``); the DA3-specific extensions
-# (RoPE, QK-norm, alternating local/global attention, camera token, multi-
-# layer feature extraction, reference-view reordering) are opt-in via the
-# config dict and are all disabled for the Mono/Metric variants.
-
-from __future__ import annotations
-
-from typing import Dict, Optional, Sequence
-
-import torch
-import torch.nn as nn
-
-from comfy.image_encoders.dino2 import Dinov2Model
-
-from .camera import CameraDec, CameraEnc
-from .dpt import DPT, DualDPT
-from .ray_pose import get_extrinsic_from_camray
-from .transform import affine_inverse, pose_encoding_to_extri_intri
-
-
-_HEAD_REGISTRY = {
-    "dpt": DPT,
-    "dualdpt": DualDPT,
-}
-
-
-# Backbone presets (mirror the upstream DINOv2 ViT variants).
-_BACKBONE_PRESETS = {
-    "vits": dict(hidden_size=384,  num_hidden_layers=12, num_attention_heads=6,  use_swiglu_ffn=False),
-    "vitb": dict(hidden_size=768,  num_hidden_layers=12, num_attention_heads=12, use_swiglu_ffn=False),
-    "vitl": dict(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16, use_swiglu_ffn=False),
-    "vitg": dict(hidden_size=1536, num_hidden_layers=40, num_attention_heads=24, use_swiglu_ffn=True),
-}
-
-
-def _build_backbone_config(
-    backbone_name: str,
-    *,
-    alt_start: int,
-    qknorm_start: int,
-    rope_start: int,
-    cat_token: bool,
-) -> dict:
-    if backbone_name not in _BACKBONE_PRESETS:
-        raise ValueError(f"Unknown DINOv2 backbone variant: {backbone_name!r}")
-    cfg = dict(_BACKBONE_PRESETS[backbone_name])
-    cfg.update(dict(
-        layer_norm_eps=1e-6,
-        patch_size=14,
-        image_size=518,
-        # No mask_token in DA3 weights; omit param to avoid load warnings.
-        use_mask_token=False,
-        alt_start=alt_start,
-        qknorm_start=qknorm_start,
-        rope_start=rope_start,
-        cat_token=cat_token,
-        rope_freq=100.0,
-    ))
-    return cfg
-
-
-class DepthAnything3Net(nn.Module):
-    """ComfyUI-side DepthAnything3 network.
-
-    Parameters mirror the variant YAML configs from the upstream repo and
-    are auto-detected from the state dict by ``comfy/model_detection.py``.
-    The kwargs ``device``, ``dtype`` and ``operations`` are injected by
-    ``BaseModel``.
-    """
-
-    PATCH_SIZE = 14
-
-    def __init__(
-        self,
-        # --- Backbone ---
-        backbone_name: str = "vitl",
-        out_layers: Sequence[int] = (4, 11, 17, 23),
-        alt_start: int = -1,
-        qknorm_start: int = -1,
-        rope_start: int = -1,
-        cat_token: bool = False,
-        # --- Head ---
-        head_type: str = "dpt",                # "dpt" or "dualdpt"
-        head_dim_in: int = 1024,
-        head_output_dim: int = 1,              # 1 = depth only, 2 = depth+conf
-        head_features: int = 256,
-        head_out_channels: Sequence[int] = (256, 512, 1024, 1024),
-        head_use_sky_head: bool = True,        # ignored by DualDPT
-        head_pos_embed: Optional[bool] = None, # default: True for DualDPT, False for DPT
-        # --- Camera (multi-view) ---
-        has_cam_enc: bool = False,
-        has_cam_dec: bool = False,
-        cam_dim_out: Optional[int] = None,     # CameraEnc dim_out (defaults to embed_dim)
-        cam_dec_dim_in: Optional[int] = None,  # CameraDec dim_in  (defaults to 2*embed_dim with cat_token)
-        # ComfyUI plumbing
-        device=None, dtype=None, operations=None,
-        **_ignored,
-    ):
-        super().__init__()
-        head_cls = _HEAD_REGISTRY[head_type.lower()]
-        self.head_type = head_type.lower()
-        self.has_sky = (self.head_type == "dpt") and head_use_sky_head
-        self.has_conf = head_output_dim > 1
-        self.out_layers = list(out_layers)
-
-        backbone_cfg = _build_backbone_config(
-            backbone_name,
-            alt_start=alt_start,
-            qknorm_start=qknorm_start,
-            rope_start=rope_start,
-            cat_token=cat_token,
-        )
-        self.backbone = Dinov2Model(backbone_cfg, dtype, device, operations)
-
-        head_kwargs = dict(
-            dim_in=head_dim_in,
-            patch_size=self.PATCH_SIZE,
-            output_dim=head_output_dim,
-            features=head_features,
-            out_channels=tuple(head_out_channels),
-            device=device, dtype=dtype, operations=operations,
-        )
-        if self.head_type == "dpt":
-            head_kwargs.update(
-                use_sky_head=head_use_sky_head,
-                pos_embed=(False if head_pos_embed is None else head_pos_embed),
-            )
-        else:  # dualdpt
-            head_kwargs.update(
-                pos_embed=(True if head_pos_embed is None else head_pos_embed),
-            )
-        self.head = head_cls(**head_kwargs)
-
-        # Built only if checkpoint has weights; cam_enc output dim == embed_dim.
-        embed_dim = backbone_cfg["hidden_size"]
-        if has_cam_enc:
-            self.cam_enc = CameraEnc(
-                dim_out=cam_dim_out if cam_dim_out is not None else embed_dim,
-                num_heads=max(1, embed_dim // 64),
-                device=device, dtype=dtype, operations=operations,
-            )
-        else:
-            self.cam_enc = None
-        if has_cam_dec:
-            default_dim = embed_dim * (2 if cat_token else 1)
-            self.cam_dec = CameraDec(
-                dim_in=cam_dec_dim_in if cam_dec_dim_in is not None else default_dim,
-                device=device, dtype=dtype, operations=operations,
-            )
-        else:
-            self.cam_dec = None
-
-        self.dtype = dtype
-
-    def forward(
-        self,
-        image: torch.Tensor,
-        extrinsics: Optional[torch.Tensor] = None,
-        intrinsics: Optional[torch.Tensor] = None,
-        *,
-        use_ray_pose: bool = False,
-        ref_view_strategy: str = "saddle_balanced",
-        export_feat_layers: Optional[Sequence[int]] = None,
-        **_unused,
-    ) -> Dict[str, torch.Tensor]:
-        """Run depth (and optionally pose) prediction.
-
-        Args:
-            image: ``(B, 3, H, W)`` ImageNet-normalised image tensor, or
-                   ``(B, S, 3, H, W)`` for multi-view inputs. ``H`` and ``W``
-                   must be multiples of 14.
-            extrinsics: optional ``(B, S, 4, 4)`` world-to-camera extrinsics.
-                When provided together with ``intrinsics``, ``CameraEnc``
-                converts them into per-view camera tokens that the backbone
-                injects at block ``alt_start``.
-            intrinsics: optional ``(B, S, 3, 3)`` pixel-space intrinsics.
-            use_ray_pose: if True, predict pose from the auxiliary "ray" head
-                (RANSAC over per-pixel rays). Only available on DualDPT
-                variants. If False (default) and ``cam_dec`` is present,
-                the final-layer cam token is decoded into pose instead.
-            ref_view_strategy: reference-view selection strategy used when
-                ``S >= 3`` and no extrinsics are supplied. See
-                :mod:`comfy.ldm.depth_anything_3.reference_view_selector`.
-            export_feat_layers: optional list of backbone layer indices whose
-                local features to also return as auxiliary outputs (used by
-                downstream nested-architecture wrappers; empty by default).
-
-        Returns:
-            Dict with a subset of:
-              - ``depth``       ``(B*S, H, W)`` raw depth values.
-              - ``depth_conf``  ``(B*S, H, W)`` confidence (DualDPT only).
-              - ``sky``         ``(B*S, H, W)`` sky probability (DPT + sky head).
-              - ``ray``         ``(B, S, h, w, 6)`` per-pixel cam ray (DualDPT,
-                                 multi-view, ``use_ray_pose=True`` only).
-              - ``ray_conf``    ``(B, S, h, w)`` ray confidence.
-              - ``extrinsics``  ``(B, S, 4, 4)`` world-to-cam, when pose
-                                 prediction is active.
-              - ``intrinsics``  ``(B, S, 3, 3)`` pixel-space intrinsics.
-              - ``aux_features`` list of ``(B, S, h_p, w_p, C)`` features
-                                 when ``export_feat_layers`` is non-empty.
-        """
-        if image.ndim == 4:
-            image = image.unsqueeze(1)  # (B, 1, 3, H, W)
-        assert image.ndim == 5 and image.shape[2] == 3, \
-            f"image must be (B,3,H,W) or (B,S,3,H,W); got {tuple(image.shape)}"
-
-        B, S, _, H, W = image.shape
-        assert H % self.PATCH_SIZE == 0 and W % self.PATCH_SIZE == 0, \
-            f"image H,W must be multiples of {self.PATCH_SIZE}; got {(H, W)}"
-
-        # Camera-token preparation (multi-view path).
-        cam_token = None
-        if extrinsics is not None and intrinsics is not None and self.cam_enc is not None:
-            cam_token = self.cam_enc(extrinsics, intrinsics, (H, W))
-
-        # Toggle aux ray output on/off depending on what the caller asked for.
-        if isinstance(self.head, DualDPT):
-            self.head.enable_aux = bool(use_ray_pose)
-
-        feats, aux_feats = self.backbone.get_intermediate_layers_da3(
-            image, self.out_layers, cam_token=cam_token,
-            ref_view_strategy=ref_view_strategy,
-            export_feat_layers=export_feat_layers,
-        )
-        head_out = self.head(feats, H=H, W=W, patch_start_idx=0)
-
-        # Pose prediction.
-        out: Dict[str, torch.Tensor] = {}
-        if use_ray_pose and "ray" in head_out and "ray_conf" in head_out:
-            ray = head_out["ray"]
-            ray_conf = head_out["ray_conf"]
-            extr_c2w, focal, pp = get_extrinsic_from_camray(
-                ray, ray_conf, ray.shape[-3], ray.shape[-2],
-            )
-            # Match the upstream output: w2c, drop the homogeneous row.
-            extr_w2c = affine_inverse(extr_c2w)[:, :, :3, :]
-            # Build pixel-space intrinsics from the normalised focal/pp output.
-            intr = torch.eye(3, device=ray.device, dtype=ray.dtype)
-            intr = intr[None, None].expand(extr_c2w.shape[0], extr_c2w.shape[1], 3, 3).clone()
-            intr[:, :, 0, 0] = focal[:, :, 0] / 2 * W
-            intr[:, :, 1, 1] = focal[:, :, 1] / 2 * H
-            intr[:, :, 0, 2] = pp[:, :, 0] * W * 0.5
-            intr[:, :, 1, 2] = pp[:, :, 1] * H * 0.5
-            out["extrinsics"] = extr_w2c
-            out["intrinsics"] = intr
-        elif self.cam_dec is not None and S > 1:
-            # Decode the cam-token of the final out_layer into a pose encoding.
-            cam_feat = feats[-1][1]  # (B, S, dim_in_to_cam_dec)
-            pose_enc = self.cam_dec(cam_feat)
-            c2w_3x4, intr = pose_encoding_to_extri_intri(pose_enc, (H, W))
-            # Match the upstream output convention: w2c (world->camera), 3x4.
-            c2w_4x4 = torch.cat([
-                c2w_3x4,
-                torch.tensor([0, 0, 0, 1], device=c2w_3x4.device, dtype=c2w_3x4.dtype)
-                    .view(1, 1, 1, 4).expand(B, S, 1, 4),
-            ], dim=-2)
-            out["extrinsics"] = affine_inverse(c2w_4x4)[:, :, :3, :]
-            out["intrinsics"] = intr
-
-        # Flatten the views axis for per-pixel outputs (depth/conf/sky) so the
-        # per-image consumer keeps its (B*S, H, W) interface.
-        for k, v in head_out.items():
-            if k in ("ray", "ray_conf"):
-                # Keep multi-view shape for downstream pose work.
-                out[k] = v
-            elif v.ndim >= 3 and v.shape[0] == B and v.shape[1] == S:
-                out[k] = v.reshape(B * S, *v.shape[2:])
-            else:
-                out[k] = v
-
-        if export_feat_layers:
-            out["aux_features"] = self._reshape_aux_features(aux_feats, H, W)
-        return out
-
-    def _reshape_aux_features(self, aux_feats, H: int, W: int):
-        """Reshape ``(B, S, N, C)`` aux features into ``(B, S, h_p, w_p, C)``."""
-        ph, pw = H // self.PATCH_SIZE, W // self.PATCH_SIZE
-        out = []
-        for f in aux_feats:
-            B, S, N, C = f.shape
-            assert N == ph * pw, f"aux feature seq mismatch: {N} != {ph}*{pw}"
-            out.append(f.reshape(B, S, ph, pw, C))
-        return out
--- a/comfy/ldm/depth_anything_3/preprocess.py
+++ b/comfy/ldm/depth_anything_3/preprocess.py
@ -1,184 +0,0 @@
-# Input/output preprocessing helpers for Depth Anything 3.
-#
-# Ported from:
-#   src/depth_anything_3/utils/io/input_processor.py    (image normalisation)
-#   src/depth_anything_3/utils/alignment.py             (sky-aware depth clip)
-#   src/depth_anything_3/model/da3.py::_process_mono_sky_estimation
-#
-# Resize: ``comfy.utils.common_upscale`` with ``upscale_method="lanczos"``.
-# Upstream uses cv2 INTER_CUBIC (upscale) / INTER_AREA (downscale); a sweep
-# across {bilinear, bicubic, area, lanczos, bislerp} on a 768->504 test image
-# showed lanczos has the lowest max-abs-diff vs the upstream cv2 output
-# (~0.13 vs 0.21-0.71 for the others), so we use it in both directions for
-# simplicity. This keeps the path stateless, on-device, and free of any
-# OpenCV dependency.
-
-from __future__ import annotations
-
-from typing import Tuple
-
-import torch
-
-import comfy.utils
-
-PATCH_SIZE = 14
-
-# ImageNet normalization constants used during DA3 training.
-_IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406])
-_IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225])
-
-
-def _round_to_patch(x: int, patch: int = PATCH_SIZE) -> int:
-    down = (x // patch) * patch
-    up = down + patch
-    return up if abs(up - x) <= abs(x - down) else down
-
-
-def compute_target_size(orig_h: int, orig_w: int, process_res: int,
-                        method: str = "upper_bound_resize") -> Tuple[int, int]:
-    """Compute (target_h, target_w) for a single image.
-
-    Methods:
-      - "upper_bound_resize": scale longest side to ``process_res``, then
-        round each dim to nearest multiple of 14 (default upstream method).
-      - "lower_bound_resize": scale shortest side to ``process_res``, then
-        round.
-    """
-    if method == "upper_bound_resize":
-        longest = max(orig_h, orig_w)
-        scale = process_res / float(longest)
-    elif method == "lower_bound_resize":
-        shortest = min(orig_h, orig_w)
-        scale = process_res / float(shortest)
-    else:
-        raise ValueError(f"Unsupported process_res_method: {method}")
-
-    new_w = max(1, _round_to_patch(int(round(orig_w * scale))))
-    new_h = max(1, _round_to_patch(int(round(orig_h * scale))))
-    return new_h, new_w
-
-
-def preprocess_image(
-    image: torch.Tensor,
-    process_res: int = 504,
-    method: str = "upper_bound_resize",
-) -> torch.Tensor:
-    """Preprocess a ComfyUI ``IMAGE`` batch for DA3.
-
-    Args:
-        image: ``(B, H, W, 3)`` float in [0, 1] (ComfyUI ``IMAGE`` convention).
-        process_res: target resolution (longest or shortest side, depending
-            on ``method``).
-        method: resize strategy.
-
-    Returns:
-        ``(B, 3, H', W')`` tensor with H' and W' multiples of 14, normalised
-        with ImageNet statistics. The tensor lives on the same device as
-        ``image``.
-    """
-    assert image.ndim == 4 and image.shape[-1] == 3, \
-        f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
-    B, H, W, _ = image.shape
-    target_h, target_w = compute_target_size(H, W, process_res, method)
-
-    # (B, H, W, 3) -> (B, 3, H, W)
-    x = image.movedim(-1, 1).contiguous()
-    if (target_h, target_w) != (H, W):
-        # Upstream uses cv2 INTER_CUBIC (upscale) / INTER_AREA (downscale).
-        # Lanczos in ``common_upscale`` is anti-aliased and produces the
-        # closest pixel-wise match in a sweep across {bilinear, bicubic,
-        # area, lanczos, bislerp}. Used in both directions for simplicity.
-        x = comfy.utils.common_upscale(
-            x.float(), target_w, target_h, "lanczos", "disabled",
-        )
-    x = x.clamp(0.0, 1.0)
-
-    mean = _IMAGENET_MEAN.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
-    std = _IMAGENET_STD.to(device=x.device, dtype=x.dtype).view(1, 3, 1, 1)
-    x = (x - mean) / std
-    return x
-
-
-# -----------------------------------------------------------------------------
-# Output post-processing (sky-aware clipping for Mono/Metric variants)
-# -----------------------------------------------------------------------------
-
-
-def compute_non_sky_mask(sky_prediction: torch.Tensor, threshold: float = 0.3) -> torch.Tensor:
-    """Boolean mask: True for non-sky pixels (sky probability < threshold)."""
-    return sky_prediction < threshold
-
-
-def apply_sky_aware_clip(
-    depth: torch.Tensor,
-    sky: torch.Tensor,
-    threshold: float = 0.3,
-    quantile: float = 0.99,
-) -> torch.Tensor:
-    """Replicates ``_process_mono_sky_estimation`` from upstream.
-
-    Clips sky regions to the 99th percentile of non-sky depth. Returns a new
-    depth tensor; ``depth`` is not modified in place.
-    """
-    non_sky = compute_non_sky_mask(sky, threshold=threshold)
-    if non_sky.sum() <= 10 or (~non_sky).sum() <= 10:
-        return depth.clone()
-
-    non_sky_depth = depth[non_sky]
-    if non_sky_depth.numel() > 100_000:
-        idx = torch.randint(0, non_sky_depth.numel(), (100_000,), device=non_sky_depth.device)
-        sampled = non_sky_depth[idx]
-    else:
-        sampled = non_sky_depth
-
-    max_depth = torch.quantile(sampled, quantile)
-    out = depth.clone()
-    out[~non_sky] = max_depth
-    return out
-
-
-def normalize_depth_v2_style(
-    depth: torch.Tensor,
-    sky: torch.Tensor | None = None,
-    low_quantile: float = 0.01,
-    high_quantile: float = 0.99,
-) -> torch.Tensor:
-    """V2-style normalization for ControlNet workflows.
-
-    Computes percentile bounds over non-sky pixels (when available),
-    then maps depth into [0, 1] with near = white (1.0).
-    """
-    if sky is not None:
-        mask = compute_non_sky_mask(sky)
-        if mask.any():
-            valid = depth[mask]
-        else:
-            valid = depth.flatten()
-    else:
-        valid = depth.flatten()
-
-    if valid.numel() > 100_000:
-        idx = torch.randint(0, valid.numel(), (100_000,), device=valid.device)
-        sample = valid[idx]
-    else:
-        sample = valid
-
-    lo = torch.quantile(sample, low_quantile)
-    hi = torch.quantile(sample, high_quantile)
-    rng = (hi - lo).clamp(min=1e-6)
-    norm = ((depth - lo) / rng).clamp(0.0, 1.0)
-    # ControlNet convention: nearer pixels are brighter (1.0).
-    norm = 1.0 - norm
-    if sky is not None:
-        # Sky pixels become black (far / unknown).
-        sky_mask = ~compute_non_sky_mask(sky)
-        norm = torch.where(sky_mask, torch.zeros_like(norm), norm)
-    return norm
-
-
-def normalize_depth_min_max(depth: torch.Tensor) -> torch.Tensor:
-    """Simple per-frame min/max normalization with near=1.0 convention."""
-    lo = depth.amin(dim=(-2, -1), keepdim=True)
-    hi = depth.amax(dim=(-2, -1), keepdim=True)
-    rng = (hi - lo).clamp(min=1e-6)
-    return 1.0 - ((depth - lo) / rng).clamp(0.0, 1.0)
--- a/comfy/ldm/depth_anything_3/ray_pose.py
+++ b/comfy/ldm/depth_anything_3/ray_pose.py
@ -1,318 +0,0 @@
-"""Ray-to-pose conversion for the multi-view path of Depth Anything 3.
-
-Converts the auxiliary "ray" output of :class:`DualDPT` (per-pixel camera
-ray vectors, predicted on the per-view local feature map) into per-view
-extrinsics + intrinsics. Implementation is a 1:1 port of
-``depth_anything_3.utils.ray_utils`` upstream, using a weighted-RANSAC
-homography fit followed by a QL decomposition.
-
-No learned parameters; pure tensor math. Output:
-
-* ``R``       -- ``(B, S, 3, 3)`` rotation matrix
-* ``T``       -- ``(B, S, 3)``   camera-space translation
-* ``focal_lengths``     -- ``(B, S, 2)``  in normalised image space (image=2x2)
-* ``principal_points``  -- ``(B, S, 2)``  ditto
-
-:func:`get_extrinsic_from_camray` wraps these into a 4x4 extrinsic matrix
-that the public node converts back into pixel-space intrinsics.
-"""
-
-from __future__ import annotations
-
-from typing import Optional, Tuple
-
-import torch
-
-
-# qr/svd use fp32: CUDA often has no fp16/bf16 kernels for these ops.
-
-
-def _ql_decomposition(A: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Decompose ``A = Q @ L`` with ``Q`` orthogonal and ``L`` lower-triangular.
-
-    Implemented in terms of QR by reversing the columns/rows; the standard
-    trick from the upstream reference. Inputs ``A`` are ``(3, 3)``.
-    """
-    P = torch.tensor([[0, 0, 1], [0, 1, 0], [1, 0, 0]],
-                     device=A.device, dtype=A.dtype)
-    A_tilde = A @ P
-    # CUDA QR is not implemented for fp16/bf16; upcast just for this call.
-    Q_tilde, R_tilde = torch.linalg.qr(A_tilde.float())
-    Q_tilde = Q_tilde.to(A.dtype)
-    R_tilde = R_tilde.to(A.dtype)
-    Q = Q_tilde @ P
-    L = P @ R_tilde @ P
-    d = torch.diag(L)
-    sign = torch.sign(d)
-    Q = Q * sign[None, :]            # scale columns of Q
-    L = L * sign[:, None]             # scale rows of L
-    return Q, L
-
-
-def _homogenize_points(points: torch.Tensor) -> torch.Tensor:
-    return torch.cat([points, torch.ones_like(points[..., :1])], dim=-1)
-
-
-# -----------------------------------------------------------------------------
-# Weighted-LSQ + RANSAC homography (batched)
-# -----------------------------------------------------------------------------
-
-
-def _find_homography_weighted_lsq(
-    src_pts: torch.Tensor,
-    dst_pts: torch.Tensor,
-    confident_weight: torch.Tensor,
-) -> torch.Tensor:
-    """Solve a single ``H`` with weighted least-squares (DLT)."""
-    N = src_pts.shape[0]
-    if N < 4:
-        raise ValueError("At least 4 points are required to compute a homography.")
-    w = confident_weight.sqrt().unsqueeze(1)  # (N, 1)
-    x = src_pts[:, 0:1]
-    y = src_pts[:, 1:2]
-    u = dst_pts[:, 0:1]
-    v = dst_pts[:, 1:2]
-    zeros = torch.zeros_like(x)
-    A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=1)
-    A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=1)
-    A = torch.cat([A1, A2], dim=0)        # (2N, 9)
-    # CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
-    _, _, Vh = torch.linalg.svd(A.float())
-    Vh = Vh.to(A.dtype)
-    H = Vh[-1].reshape(3, 3)
-    return H / H[-1, -1]
-
-
-def _find_homography_weighted_lsq_batched(
-    src_pts_batch: torch.Tensor,
-    dst_pts_batch: torch.Tensor,
-    confident_weight_batch: torch.Tensor,
-) -> torch.Tensor:
-    """Batched DLT solver. Inputs ``(B, K, 2)`` / ``(B, K)``; output ``(B, 3, 3)``."""
-    B, K, _ = src_pts_batch.shape
-    w = confident_weight_batch.sqrt().unsqueeze(2)
-    x = src_pts_batch[:, :, 0:1]
-    y = src_pts_batch[:, :, 1:2]
-    u = dst_pts_batch[:, :, 0:1]
-    v = dst_pts_batch[:, :, 1:2]
-    zeros = torch.zeros_like(x)
-    A1 = torch.cat([-x * w, -y * w, -w, zeros, zeros, zeros, x * u * w, y * u * w, u * w], dim=2)
-    A2 = torch.cat([zeros, zeros, zeros, -x * w, -y * w, -w, x * v * w, y * v * w, v * w], dim=2)
-    A = torch.cat([A1, A2], dim=1)        # (B, 2K, 9)
-    # CUDA SVD is not implemented for fp16/bf16; upcast just for this call.
-    _, _, Vh = torch.linalg.svd(A.float())
-    Vh = Vh.to(A.dtype)
-    H = Vh[:, -1].reshape(B, 3, 3)
-    return H / H[:, 2:3, 2:3]
-
-
-def _ransac_find_homography_weighted_batched(
-    src_pts: torch.Tensor,                # (B, N, 2)
-    dst_pts: torch.Tensor,                # (B, N, 2)
-    confident_weight: torch.Tensor,       # (B, N)
-    n_sample: int,
-    n_iter: int = 100,
-    reproj_threshold: float = 3.0,
-    num_sample_for_ransac: int = 8,
-    max_inlier_num: int = 10000,
-    rand_sample_iters_idx: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    """Batched weighted-RANSAC homography estimator.
-
-    Returns ``(B, 3, 3)`` homography matrices.
-    """
-    B, N, _ = src_pts.shape
-    assert N >= 4
-    device = src_pts.device
-
-    sorted_idx = torch.argsort(confident_weight, descending=True, dim=1)
-    candidate_idx = sorted_idx[:, :n_sample]                  # (B, n_sample)
-
-    if rand_sample_iters_idx is None:
-        rand_sample_iters_idx = torch.stack(
-            [torch.randperm(n_sample, device=device)[:num_sample_for_ransac]
-             for _ in range(n_iter)],
-            dim=0,
-        )
-
-    rand_idx = candidate_idx[:, rand_sample_iters_idx]        # (B, n_iter, k)
-    b_idx = (
-        torch.arange(B, device=device)
-        .view(B, 1, 1)
-        .expand(B, n_iter, num_sample_for_ransac)
-    )
-    src_b = src_pts[b_idx, rand_idx]
-    dst_b = dst_pts[b_idx, rand_idx]
-    w_b = confident_weight[b_idx, rand_idx]
-
-    cB, cN = src_b.shape[:2]
-    H_batch = _find_homography_weighted_lsq_batched(
-        src_b.flatten(0, 1), dst_b.flatten(0, 1), w_b.flatten(0, 1),
-    ).unflatten(0, (cB, cN))                                  # (B, n_iter, 3, 3)
-
-    src_homo = torch.cat([src_pts, torch.ones(B, N, 1, device=device, dtype=src_pts.dtype)], dim=2)
-    proj = torch.bmm(
-        src_homo.unsqueeze(1).expand(B, n_iter, N, 3).reshape(-1, N, 3),
-        H_batch.reshape(-1, 3, 3).transpose(1, 2),
-    )                                                          # (B*n_iter, N, 3)
-    proj_xy = (proj[:, :, :2] / proj[:, :, 2:3]).reshape(B, n_iter, N, 2)
-    err = ((proj_xy - dst_pts.unsqueeze(1)) ** 2).sum(-1).sqrt()  # (B, n_iter, N)
-    inlier_mask = err < reproj_threshold
-    score = (inlier_mask * confident_weight.unsqueeze(1)).sum(dim=2)
-    best_idx = torch.argmax(score, dim=1)
-    best_inlier_mask = inlier_mask[torch.arange(B, device=device), best_idx]
-
-    # Refit with the inlier set (per-batch, since the inlier counts vary).
-    H_inlier_list = []
-    for b in range(B):
-        mask = best_inlier_mask[b]
-        in_src = src_pts[b][mask]
-        in_dst = dst_pts[b][mask]
-        in_w = confident_weight[b][mask]
-        if in_src.shape[0] < 4:
-            # Fall back to identity when RANSAC fails to find enough inliers.
-            H_inlier_list.append(torch.eye(3, device=device, dtype=src_pts.dtype))
-            continue
-        sorted_w = torch.argsort(in_w, descending=True)
-        if len(sorted_w) > max_inlier_num:
-            keep = max(int(len(sorted_w) * 0.95), max_inlier_num)
-            sorted_w = sorted_w[:keep][torch.randperm(keep, device=device)[:max_inlier_num]]
-        H_inlier_list.append(
-            _find_homography_weighted_lsq(in_src[sorted_w], in_dst[sorted_w], in_w[sorted_w])
-        )
-    return torch.stack(H_inlier_list, dim=0)
-
-
-# -----------------------------------------------------------------------------
-# Camera-ray utilities
-# -----------------------------------------------------------------------------
-
-
-def _unproject_identity(num_y: int, num_x: int, B: int, S: int,
-                        device, dtype) -> torch.Tensor:
-    """Camera-space unit rays for an identity intrinsic on a 2x2 image plane.
-
-    Replicates ``unproject_depth(..., ixt_normalized=True)`` upstream: pixel
-    coords ``(x, y)`` in ``[dx, 2-dx] x [dy, 2-dy]`` get mapped to
-    camera-space rays ``(x-1, y-1, 1)`` via the identity intrinsic
-    ``[[1,0,1],[0,1,1],[0,0,1]]``. Returns ``(B, S, num_y, num_x, 3)``.
-    """
-    dx = 1.0 / num_x
-    dy = 1.0 / num_y
-    # Centered camera-space coords directly (skip the K^-1 step since it's
-    # just a translation by -1 on x and y when K is identity-with-center=1).
-    y = torch.linspace(-(1 - dy), (1 - dy), num_y, device=device, dtype=dtype)
-    x = torch.linspace(-(1 - dx), (1 - dx), num_x, device=device, dtype=dtype)
-    yy, xx = torch.meshgrid(y, x, indexing="ij")
-    grid = torch.stack((xx, yy), dim=-1)            # (h, w, 2)
-    grid = grid.unsqueeze(0).unsqueeze(0).expand(B, S, num_y, num_x, 2)
-    return torch.cat([grid, torch.ones_like(grid[..., :1])], dim=-1)
-
-
-def _camray_to_caminfo(
-    camray: torch.Tensor,                   # (B, S, h, w, 6)
-    confidence: Optional[torch.Tensor] = None,  # (B, S, h, w)
-    reproj_threshold: float = 0.2,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Convert per-pixel camera rays to per-view (R, T, focal, principal)."""
-    if confidence is None:
-        confidence = torch.ones_like(camray[..., 0])
-    B, S, h, w, _ = camray.shape
-    device = camray.device
-    dtype = camray.dtype
-
-    rays_target = camray[..., :3]                           # (B, S, h, w, 3)
-    rays_origin = _unproject_identity(h, w, B, S, device, dtype)
-
-    # Flatten (B*S, h*w, *) for the RANSAC routine.
-    rays_target = rays_target.flatten(0, 1).flatten(1, 2)
-    rays_origin = rays_origin.flatten(0, 1).flatten(1, 2)
-    weights = confidence.flatten(0, 1).flatten(1, 2).clone()
-
-    # Project to 2D in homogeneous form (the upstream calls this "perspective division").
-    z_thresh = 1e-4
-    mask = (rays_target[:, :, 2].abs() > z_thresh) & (rays_origin[:, :, 2].abs() > z_thresh)
-    weights = torch.where(mask, weights, torch.zeros_like(weights))
-    src = rays_origin.clone()
-    dst = rays_target.clone()
-    src[..., 0] = torch.where(mask, src[..., 0] / src[..., 2], src[..., 0])
-    src[..., 1] = torch.where(mask, src[..., 1] / src[..., 2], src[..., 1])
-    dst[..., 0] = torch.where(mask, dst[..., 0] / dst[..., 2], dst[..., 0])
-    dst[..., 1] = torch.where(mask, dst[..., 1] / dst[..., 2], dst[..., 1])
-    src = src[..., :2]
-    dst = dst[..., :2]
-
-    N = src.shape[1]
-    n_iter = 100
-    sample_ratio = 0.3
-    num_sample_for_ransac = 8
-    n_sample = max(num_sample_for_ransac, int(N * sample_ratio))
-    rand_idx = torch.stack(
-        [torch.randperm(n_sample, device=device)[:num_sample_for_ransac] for _ in range(n_iter)],
-        dim=0,
-    )
-
-    # Chunk along the view axis to keep peak memory predictable.
-    chunk = 2
-    A_list = []
-    for i in range(0, src.shape[0], chunk):
-        A = _ransac_find_homography_weighted_batched(
-            src[i:i + chunk], dst[i:i + chunk], weights[i:i + chunk],
-            n_sample=n_sample, n_iter=n_iter,
-            num_sample_for_ransac=num_sample_for_ransac,
-            reproj_threshold=reproj_threshold,
-            rand_sample_iters_idx=rand_idx,
-            max_inlier_num=8000,
-        )
-        # Flip sign on dets that come out < 0 (so that the QL produces a
-        # right-handed rotation). ``det`` lacks fp16/bf16 CUDA kernels, so
-        # do the comparison in fp32.
-        flip = torch.linalg.det(A.float()) < 0
-        A = torch.where(flip[:, None, None], -A, A)
-        A_list.append(A)
-    A = torch.cat(A_list, dim=0)                            # (B*S, 3, 3)
-
-    R_list, f_list, pp_list = [], [], []
-    for i in range(A.shape[0]):
-        R, L = _ql_decomposition(A[i])
-        L = L / L[2][2]
-        f_list.append(torch.stack((L[0][0], L[1][1])))
-        pp_list.append(torch.stack((L[2][0], L[2][1])))
-        R_list.append(R)
-    R = torch.stack(R_list).reshape(B, S, 3, 3)
-    focal = torch.stack(f_list).reshape(B, S, 2)
-    pp = torch.stack(pp_list).reshape(B, S, 2)
-
-    # Translation: confidence-weighted average of camray direction(s).
-    cf = confidence.flatten(0, 1).flatten(1, 2)
-    T = (camray.flatten(0, 1).flatten(1, 2)[..., 3:] * cf.unsqueeze(-1)).sum(dim=1)
-    T = T / cf.sum(dim=-1, keepdim=True)
-    T = T.reshape(B, S, 3)
-
-    # Match upstream output convention: focal -> 1/focal, pp + 1.
-    return R, T, 1.0 / focal, pp + 1.0
-
-
-def get_extrinsic_from_camray(
-    camray: torch.Tensor,                       # (B, S, h, w, 6)
-    conf: torch.Tensor,                          # (B, S, h, w, 1) or (B, S, h, w)
-    patch_size_y: int,
-    patch_size_x: int,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Wrap a 4x4 extrinsic + per-view focal + principal-point output.
-
-    Returns:
-      * extrinsic   ``(B, S, 4, 4)``   camera-to-world (the inverse is
-                                       what gets stored in ``output.extrinsics``
-                                       by the caller).
-      * focals      ``(B, S, 2)``       in normalised image space.
-      * pp          ``(B, S, 2)``       in normalised image space.
-    """
-    if conf.ndim == 5 and conf.shape[-1] == 1:
-        conf = conf.squeeze(-1)
-    R, T, focal, pp = _camray_to_caminfo(camray, confidence=conf)
-    extr = torch.cat([R, T.unsqueeze(-1)], dim=-1)           # (B, S, 3, 4)
-    homo_row = torch.tensor([0, 0, 0, 1], dtype=R.dtype, device=R.device)
-    homo_row = homo_row.view(1, 1, 1, 4).expand(R.shape[0], R.shape[1], 1, 4)
-    extr = torch.cat([extr, homo_row], dim=-2)               # (B, S, 4, 4)
-    return extr, focal, pp
--- a/comfy/ldm/depth_anything_3/reference_view_selector.py
+++ b/comfy/ldm/depth_anything_3/reference_view_selector.py
@ -1,116 +0,0 @@
-"""Reference-view selection for the multi-view path of Depth Anything 3.
-
-Pure tensor math, no learned parameters. Exposed as three free functions:
-
-* :func:`select_reference_view`   -- pick a reference view per batch.
-* :func:`reorder_by_reference`    -- move the reference view to position 0.
-* :func:`restore_original_order`  -- inverse of :func:`reorder_by_reference`.
-
-Mirrors ``depth_anything_3.model.reference_view_selector`` upstream.
-The default strategy (``"saddle_balanced"``) selects the view whose CLS
-token features are closest to the median across multiple metrics.
-"""
-
-from __future__ import annotations
-
-from typing import Literal
-
-import torch
-
-
-RefViewStrategy = Literal["first", "middle", "saddle_balanced", "saddle_sim_range"]
-
-
-# Per the upstream constants module: ``THRESH_FOR_REF_SELECTION = 3``.
-# Reference selection only runs when there are at least this many views.
-THRESH_FOR_REF_SELECTION: int = 3
-
-
-def select_reference_view(
-    x: torch.Tensor,
-    strategy: RefViewStrategy = "saddle_balanced",
-) -> torch.Tensor:
-    """Pick a reference view index per batch element.
-
-    Args:
-        x: ``(B, S, N, C)`` token tensor. Index 0 along ``N`` is the
-            cls/cam token used by the feature-based strategies.
-        strategy: One of ``"first" | "middle" | "saddle_balanced" |
-            "saddle_sim_range"``.
-
-    Returns:
-        ``(B,)`` long tensor with the chosen reference view index for
-        each batch element.
-    """
-    B, S, _, _ = x.shape
-    if S <= 1:
-        return torch.zeros(B, dtype=torch.long, device=x.device)
-    if strategy == "first":
-        return torch.zeros(B, dtype=torch.long, device=x.device)
-    if strategy == "middle":
-        return torch.full((B,), S // 2, dtype=torch.long, device=x.device)
-
-    # Feature-based strategies: normalised cls/cam token per view.
-    img_class_feat = x[:, :, 0] / x[:, :, 0].norm(dim=-1, keepdim=True)  # (B,S,C)
-
-    if strategy == "saddle_balanced":
-        sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2))  # (B,S,S)
-        sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
-        sim_score = sim_no_diag.sum(dim=-1) / (S - 1)               # (B,S)
-        feat_norm = x[:, :, 0].norm(dim=-1)                          # (B,S)
-        feat_var = img_class_feat.var(dim=-1)                        # (B,S)
-
-        def _normalize(metric):
-            mn = metric.min(dim=1, keepdim=True).values
-            mx = metric.max(dim=1, keepdim=True).values
-            return (metric - mn) / (mx - mn + 1e-8)
-
-        sim_n, norm_n, var_n = _normalize(sim_score), _normalize(feat_norm), _normalize(feat_var)
-        balance = (sim_n - 0.5).abs() + (norm_n - 0.5).abs() + (var_n - 0.5).abs()
-        return balance.argmin(dim=1)
-
-    if strategy == "saddle_sim_range":
-        sim = torch.matmul(img_class_feat, img_class_feat.transpose(1, 2))
-        sim_no_diag = sim - torch.eye(S, device=sim.device).unsqueeze(0)
-        sim_max = sim_no_diag.max(dim=-1).values
-        sim_min = sim_no_diag.min(dim=-1).values
-        return (sim_max - sim_min).argmax(dim=1)
-
-    raise ValueError(
-        f"Unknown reference view selection strategy: {strategy!r}. "
-        f"Must be one of: 'first', 'middle', 'saddle_balanced', 'saddle_sim_range'"
-    )
-
-
-def reorder_by_reference(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
-    """Reorder ``x`` so the reference view is at position 0 in axis ``S``."""
-    B, S = x.shape[0], x.shape[1]
-    if S <= 1:
-        return x
-    positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
-    b_idx_exp = b_idx.unsqueeze(1)
-    reorder = torch.where(
-        (positions > 0) & (positions <= b_idx_exp),
-        positions - 1,
-        positions,
-    )
-    reorder[:, 0] = b_idx
-    batch = torch.arange(B, device=x.device).unsqueeze(1)
-    return x[batch, reorder]
-
-
-def restore_original_order(x: torch.Tensor, b_idx: torch.Tensor) -> torch.Tensor:
-    """Inverse of :func:`reorder_by_reference`."""
-    B, S = x.shape[0], x.shape[1]
-    if S <= 1:
-        return x
-    target_positions = torch.arange(S, device=x.device).unsqueeze(0).expand(B, -1)
-    b_idx_exp = b_idx.unsqueeze(1)
-    restore = torch.where(target_positions < b_idx_exp,
-                          target_positions + 1,
-                          target_positions)
-    restore = torch.scatter(
-        restore, dim=1, index=b_idx_exp, src=torch.zeros_like(b_idx_exp),
-    )
-    batch = torch.arange(B, device=x.device).unsqueeze(1)
-    return x[batch, restore]
--- a/comfy/ldm/depth_anything_3/transform.py
+++ b/comfy/ldm/depth_anything_3/transform.py
@ -1,180 +0,0 @@
-"""Geometry / camera transform helpers for Depth Anything 3.
-
-Pure tensor math, no learned parameters. Mirrors the upstream upstream
-``depth_anything_3.model.utils.transform`` and the parts of
-``depth_anything_3.utils.geometry`` used at inference time on the
-multi-view + camera path. Kept self-contained so the DA3 module is fully
-ported and does not depend on the upstream repo at runtime.
-"""
-
-from __future__ import annotations
-
-from typing import Tuple
-
-import torch
-import torch.nn.functional as F
-
-
-# -----------------------------------------------------------------------------
-# Affine 4x4 helpers
-# -----------------------------------------------------------------------------
-
-
-def as_homogeneous(ext: torch.Tensor) -> torch.Tensor:
-    """Promote ``(...,3,4)`` extrinsics to ``(...,4,4)`` homogeneous form.
-
-    A no-op when the input is already ``(...,4,4)``.
-    """
-    if ext.shape[-2:] == (4, 4):
-        return ext
-    if ext.shape[-2:] == (3, 4):
-        ones = torch.zeros_like(ext[..., :1, :4])
-        ones[..., 0, 3] = 1.0
-        return torch.cat([ext, ones], dim=-2)
-    raise ValueError(f"Invalid affine shape: {ext.shape}")
-
-
-def affine_inverse(A: torch.Tensor) -> torch.Tensor:
-    """Inverse of an affine matrix ``[R|T; 0 0 0 1]``."""
-    R = A[..., :3, :3]
-    T = A[..., :3, 3:]
-    P = A[..., 3:, :]
-    return torch.cat([torch.cat([R.mT, -R.mT @ T], dim=-1), P], dim=-2)
-
-
-# -----------------------------------------------------------------------------
-# Quaternion <-> rotation matrix (xyzw / scalar-last)
-# -----------------------------------------------------------------------------
-
-
-def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
-    """``sqrt(max(0, x))`` with a zero subgradient where ``x == 0``."""
-    ret = torch.zeros_like(x)
-    positive_mask = x > 0
-    if torch.is_grad_enabled():
-        ret[positive_mask] = torch.sqrt(x[positive_mask])
-    else:
-        ret = torch.where(positive_mask, torch.sqrt(x), ret)
-    return ret
-
-
-def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
-    """Force the real part of a unit quaternion (xyzw) to be non-negative."""
-    return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)
-
-
-def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
-    """Convert quaternions (xyzw) to ``(...,3,3)`` rotation matrices."""
-    i, j, k, r = torch.unbind(quaternions, -1)
-    two_s = 2.0 / (quaternions * quaternions).sum(-1)
-    o = torch.stack(
-        (
-            1 - two_s * (j * j + k * k),
-            two_s * (i * j - k * r),
-            two_s * (i * k + j * r),
-            two_s * (i * j + k * r),
-            1 - two_s * (i * i + k * k),
-            two_s * (j * k - i * r),
-            two_s * (i * k - j * r),
-            two_s * (j * k + i * r),
-            1 - two_s * (i * i + j * j),
-        ),
-        -1,
-    )
-    return o.reshape(quaternions.shape[:-1] + (3, 3))
-
-
-def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
-    """Convert ``(...,3,3)`` rotation matrices to quaternions (xyzw)."""
-    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
-        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
-
-    batch_dim = matrix.shape[:-2]
-    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
-        matrix.reshape(batch_dim + (9,)), dim=-1
-    )
-
-    q_abs = _sqrt_positive_part(
-        torch.stack(
-            [
-                1.0 + m00 + m11 + m22,
-                1.0 + m00 - m11 - m22,
-                1.0 - m00 + m11 - m22,
-                1.0 - m00 - m11 + m22,
-            ],
-            dim=-1,
-        )
-    )
-
-    quat_by_rijk = torch.stack(
-        [
-            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
-            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
-            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
-            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
-        ],
-        dim=-2,
-    )
-
-    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
-    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
-
-    out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(
-        batch_dim + (4,)
-    )
-    # Reorder rijk -> xyzw (i.e. ijkr).
-    out = out[..., [1, 2, 3, 0]]
-    return standardize_quaternion(out)
-
-
-# -----------------------------------------------------------------------------
-# Pose-encoding <-> extrinsics + intrinsics
-# -----------------------------------------------------------------------------
-
-
-def extri_intri_to_pose_encoding(
-    extrinsics: torch.Tensor,
-    intrinsics: torch.Tensor,
-    image_size_hw: Tuple[int, int],
-) -> torch.Tensor:
-    """Pack ``(extr, intr, image_size)`` into the 9-D pose-encoding vector.
-
-    ``extrinsics`` are camera-to-world (c2w) ``(B,S,4,4)`` matrices,
-    ``intrinsics`` are pixel-space ``(B,S,3,3)`` matrices, ``image_size_hw``
-    is a ``(H, W)`` pair. The encoding is ``[T(3), quat_xyzw(4), fov_h, fov_w]``.
-    """
-    R = extrinsics[..., :3, :3]
-    T = extrinsics[..., :3, 3]
-    quat = mat_to_quat(R)
-    H, W = image_size_hw
-    fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
-    fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
-    return torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
-
-
-def pose_encoding_to_extri_intri(
-    pose_encoding: torch.Tensor,
-    image_size_hw: Tuple[int, int],
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Inverse of :func:`extri_intri_to_pose_encoding`.
-
-    Returns a ``(B,S,3,4)`` c2w extrinsic matrix and a ``(B,S,3,3)``
-    pixel-space intrinsic matrix.
-    """
-    T = pose_encoding[..., :3]
-    quat = pose_encoding[..., 3:7]
-    fov_h = pose_encoding[..., 7]
-    fov_w = pose_encoding[..., 8]
-    R = quat_to_mat(quat)
-    extrinsics = torch.cat([R, T[..., None]], dim=-1)
-    H, W = image_size_hw
-    fy = (H / 2.0) / torch.clamp(torch.tan(fov_h / 2.0), 1e-6)
-    fx = (W / 2.0) / torch.clamp(torch.tan(fov_w / 2.0), 1e-6)
-    intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3),
-                             device=pose_encoding.device, dtype=pose_encoding.dtype)
-    intrinsics[..., 0, 0] = fx
-    intrinsics[..., 1, 1] = fy
-    intrinsics[..., 0, 2] = W / 2
-    intrinsics[..., 1, 2] = H / 2
-    intrinsics[..., 2, 2] = 1.0
-    return extrinsics, intrinsics
--- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
+++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
@ -328,7 +328,7 @@ class CrossAttention(nn.Module):
        kv = torch.cat((k, v), dim=-1)
        split_size = kv.shape[-1] // self.num_heads // 2

-        kv = kv.view(1, -1, self.num_heads, split_size * 2)
+        kv = kv.view(b, -1, self.num_heads, split_size * 2)
        k, v = torch.split(kv, split_size, dim=-1)

        q = q.view(b, s1, self.num_heads, self.head_dim)
@ -398,7 +398,7 @@ class Attention(nn.Module):
        qkv_combined = torch.cat((query, key, value), dim=-1)
        split_size = qkv_combined.shape[-1] // self.num_heads // 3

-        qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3)
+        qkv = qkv_combined.view(B, -1, self.num_heads, split_size * 3)
        query, key, value = torch.split(qkv, split_size, dim=-1)

        query = query.reshape(B, N, self.num_heads, self.head_dim)
@ -607,9 +607,9 @@ class HunYuanDiTPlain(nn.Module):
    def forward(self, x, t, context, transformer_options = {}, **kwargs):

        x = x.movedim(-1, -2)
-        uncond_emb, cond_emb = context.chunk(2, dim = 0)
-
-        context = torch.cat([cond_emb, uncond_emb], dim = 0)
+        if context.shape[0] >= 2:
+            uncond_emb, cond_emb = context.chunk(2, dim = 0)
+            context = torch.cat([cond_emb, uncond_emb], dim = 0)
        main_condition = context

        t = 1.0 - t
@ -657,5 +657,8 @@ class HunYuanDiTPlain(nn.Module):
        output = self.final_layer(combined)
        output =  output.movedim(-2, -1) * (-1.0)

-        cond_emb, uncond_emb = output.chunk(2, dim = 0)
-        return torch.cat([uncond_emb, cond_emb])
+        if output.shape[0] >= 2:
+            cond_emb, uncond_emb = output.chunk(2, dim = 0)
+            return torch.cat([uncond_emb, cond_emb])
+        else:
+            return output
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -60,7 +60,6 @@ import comfy.ldm.ernie.model
 import comfy.ldm.sam3.detector
 import comfy.ldm.hidream_o1.model
 from comfy.ldm.hidream_o1.conditioning import build_extra_conds
-import comfy.ldm.depth_anything_3.model

 import comfy.model_management
 import comfy.patcher_extension
@ -2043,12 +2042,6 @@ class RT_DETR_v4(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.rt_detr.rtdetr_v4.RTv4)

-
-class DepthAnything3(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device,
-                         unet_model=comfy.ldm.depth_anything_3.model.DepthAnything3Net)
-
 class ErnieImage(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ernie.model.ErnieImageModel)
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -766,108 +766,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
        return dit_config

-    # Depth Anything 3
-    if '{}backbone.pretrained.patch_embed.proj.weight'.format(key_prefix) in state_dict_keys:
-        dit_config = {}
-        dit_config["image_model"] = "DepthAnything3"
-
-        patch_w = state_dict['{}backbone.pretrained.patch_embed.proj.weight'.format(key_prefix)]
-        embed_dim = patch_w.shape[0]
-        depth = count_blocks(state_dict_keys, '{}backbone.pretrained.blocks.'.format(key_prefix) + '{}.')
-
-        # Backbone preset is determined by embed_dim (matches vits/vitb/vitl/vitg).
-        backbone_name = {384: "vits", 768: "vitb", 1024: "vitl", 1536: "vitg"}.get(embed_dim)
-        if backbone_name is None:
-            return None
-        dit_config["backbone_name"] = backbone_name
-
-        # Detect DA3 extensions on top of vanilla DINOv2.
-        has_camera_token = '{}backbone.pretrained.camera_token'.format(key_prefix) in state_dict_keys
-        # qk-norm shows up as `attn.q_norm.weight` on enabled blocks.
-        qknorm_indices = [
-            i for i in range(depth)
-            if '{}backbone.pretrained.blocks.{}.attn.q_norm.weight'.format(key_prefix, i) in state_dict_keys
-        ]
-        qknorm_start = qknorm_indices[0] if qknorm_indices else -1
-
-        # The DA3 main-series configs always set alt_start == qknorm_start == rope_start.
-        # cat_token=True is implied by the presence of camera_token.
-        if has_camera_token:
-            dit_config["alt_start"] = qknorm_start
-            dit_config["rope_start"] = qknorm_start
-            dit_config["qknorm_start"] = qknorm_start
-            dit_config["cat_token"] = True
-        else:
-            dit_config["alt_start"] = -1
-            dit_config["rope_start"] = -1
-            dit_config["qknorm_start"] = -1
-            dit_config["cat_token"] = False
-
-        # Detect head type and config.
-        has_aux = '{}head.scratch.refinenet1_aux.out_conv.weight'.format(key_prefix) in state_dict_keys
-        if has_aux:
-            dit_config["head_type"] = "dualdpt"
-            # DualDPT: dim_in = 2 * embed_dim (because cat_token doubles token width).
-            head_dim_in = state_dict['{}head.projects.0.weight'.format(key_prefix)].shape[1]
-            out_channels = [
-                state_dict['{}head.projects.{}.weight'.format(key_prefix, i)].shape[0]
-                for i in range(4)
-            ]
-            features = state_dict['{}head.scratch.refinenet1.out_conv.weight'.format(key_prefix)].shape[0]
-            dit_config["head_dim_in"] = head_dim_in
-            dit_config["head_output_dim"] = 2
-            dit_config["head_features"] = features
-            dit_config["head_out_channels"] = out_channels
-            dit_config["head_use_sky_head"] = False
-        else:
-            dit_config["head_type"] = "dpt"
-            head_dim_in = state_dict['{}head.projects.0.weight'.format(key_prefix)].shape[1]
-            out_channels = [
-                state_dict['{}head.projects.{}.weight'.format(key_prefix, i)].shape[0]
-                for i in range(4)
-            ]
-            features = state_dict['{}head.scratch.refinenet1.out_conv.weight'.format(key_prefix)].shape[0]
-            output_dim = state_dict[
-                '{}head.scratch.output_conv2.2.weight'.format(key_prefix)
-            ].shape[0]
-            dit_config["head_dim_in"] = head_dim_in
-            dit_config["head_output_dim"] = output_dim
-            dit_config["head_features"] = features
-            dit_config["head_out_channels"] = out_channels
-            dit_config["head_use_sky_head"] = (
-                '{}head.scratch.sky_output_conv2.0.weight'.format(key_prefix) in state_dict_keys
-            )
-
-        # out_layers: hard-coded per upstream YAML config (depth-aware default).
-        if depth >= 24:
-            # vitl: depths used vary between DA3-Large (DualDPT) and Mono/Metric (DPT).
-            if has_aux:
-                dit_config["out_layers"] = [11, 15, 19, 23]
-            else:
-                dit_config["out_layers"] = [4, 11, 17, 23]
-        else:
-            # vits/vitb: 12 blocks
-            dit_config["out_layers"] = [5, 7, 9, 11]
-
-        # Camera encoder/decoder presence (multi-view + pose path).
-        has_cam_enc = '{}cam_enc.token_norm.weight'.format(key_prefix) in state_dict_keys
-        has_cam_dec = '{}cam_dec.fc_t.weight'.format(key_prefix) in state_dict_keys
-        dit_config["has_cam_enc"] = has_cam_enc
-        dit_config["has_cam_dec"] = has_cam_dec
-        if has_cam_enc:
-            cam_enc_w = state_dict.get(
-                '{}cam_enc.pose_branch.fc2.weight'.format(key_prefix)
-            )
-            if cam_enc_w is not None:
-                dit_config["cam_dim_out"] = cam_enc_w.shape[0]
-        if has_cam_dec:
-            cam_dec_w = state_dict.get(
-                '{}cam_dec.fc_t.weight'.format(key_prefix)
-            )
-            if cam_dec_w is not None:
-                dit_config["cam_dec_dim_in"] = cam_dec_w.shape[1]
-        return dit_config
-
    if '{}layers.0.mlp.linear_fc2.weight'.format(key_prefix) in state_dict_keys: # Ernie Image
        dit_config = {}
        dit_config["image_model"] = "ernie"
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -260,7 +260,7 @@ def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, w


 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False):
-    # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass
+    # NOTE: offloadable=False is a legacy mode and if you are a custom node author reading this please pass
    # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This
    # will add async-offload support to your cast and improve performance.
    if input is not None:
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -1847,101 +1847,6 @@ class RT_DETR_v4(supported_models_base.BASE):
        return None


-class DepthAnything3(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "DepthAnything3",
-    }
-
-    # Mono path: no num_heads / num_head_channels needed.
-    unet_extra_config = {}
-
-    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        return model_base.DepthAnything3(self, device=device)
-
-    def clip_target(self, state_dict={}):
-        return None
-
-    def process_unet_state_dict(self, state_dict):
-        # Drop Gaussian-head weights; remap fused backbone QKV to Dinov2Model layout.
-        drop_prefixes = ("gs_head.", "gs_adapter.")
-        for k in list(state_dict.keys()):
-            if k.startswith(drop_prefixes):
-                state_dict.pop(k)
-        return _da3_remap_backbone_keys(state_dict, prefix="backbone.")
-
-
-def _da3_remap_backbone_keys(state_dict, prefix="backbone."):
-    """Map ``backbone.pretrained.*`` (upstream DA3) keys to ``Dinov2Model`` under ``prefix``."""
-    pre = prefix + "pretrained."
-    src_keys = [k for k in state_dict.keys() if k.startswith(pre)]
-    if not src_keys:
-        return state_dict
-
-    static_renames = {
-        pre + "patch_embed.proj.weight":  prefix + "embeddings.patch_embeddings.projection.weight",
-        pre + "patch_embed.proj.bias":    prefix + "embeddings.patch_embeddings.projection.bias",
-        pre + "pos_embed":                prefix + "embeddings.position_embeddings",
-        pre + "cls_token":                prefix + "embeddings.cls_token",
-        pre + "camera_token":             prefix + "embeddings.camera_token",
-        pre + "norm.weight":              prefix + "layernorm.weight",
-        pre + "norm.bias":                prefix + "layernorm.bias",
-    }
-    for src, dst in static_renames.items():
-        if src in state_dict:
-            state_dict[dst] = state_dict.pop(src)
-
-    block_pre = pre + "blocks."
-    block_keys = [k for k in state_dict.keys() if k.startswith(block_pre)]
-    for k in block_keys:
-        rest = k[len(block_pre):]                 # e.g. "5.attn.qkv.weight"
-        idx_str, _, sub = rest.partition(".")
-        target_block = "{}encoder.layer.{}.".format(prefix, idx_str)
-
-        # Fused QKV -> split query/key/value linears.
-        if sub == "attn.qkv.weight":
-            qkv = state_dict.pop(k)
-            c = qkv.shape[0] // 3
-            state_dict[target_block + "attention.attention.query.weight"] = qkv[:c].clone()
-            state_dict[target_block + "attention.attention.key.weight"]   = qkv[c:2 * c].clone()
-            state_dict[target_block + "attention.attention.value.weight"] = qkv[2 * c:].clone()
-            continue
-        if sub == "attn.qkv.bias":
-            qkv = state_dict.pop(k)
-            c = qkv.shape[0] // 3
-            state_dict[target_block + "attention.attention.query.bias"] = qkv[:c].clone()
-            state_dict[target_block + "attention.attention.key.bias"]   = qkv[c:2 * c].clone()
-            state_dict[target_block + "attention.attention.value.bias"] = qkv[2 * c:].clone()
-            continue
-
-        # Sub-key remap (suffix preserved).
-        if sub.startswith("attn.proj."):
-            tail = sub[len("attn.proj."):]
-            new = "attention.output.dense." + tail
-        elif sub.startswith("attn.q_norm."):
-            new = "attention.q_norm." + sub[len("attn.q_norm."):]
-        elif sub.startswith("attn.k_norm."):
-            new = "attention.k_norm." + sub[len("attn.k_norm."):]
-        elif sub == "ls1.gamma":
-            new = "layer_scale1.lambda1"
-        elif sub == "ls2.gamma":
-            new = "layer_scale2.lambda1"
-        elif sub.startswith("mlp.w12."):
-            new = "mlp.weights_in." + sub[len("mlp.w12."):]
-        elif sub.startswith("mlp.w3."):
-            new = "mlp.weights_out." + sub[len("mlp.w3."):]
-        elif sub.startswith(("norm1.", "norm2.", "mlp.fc1.", "mlp.fc2.")):
-            new = sub
-        else:
-            # Unrecognised key -- leave as-is so load_state_dict can complain.
-            continue
-
-        state_dict[target_block + new] = state_dict.pop(k)
-
-    return state_dict
-
-
 class ErnieImage(supported_models_base.BASE):
    unet_config = {
        "image_model": "ernie",
@ -2177,5 +2082,4 @@ models = [
    CogVideoX_I2V,
    CogVideoX_T2V,
    SVD_img2vid,
-    DepthAnything3,
 ]
--- a/comfy_api_nodes/apis/anthropic.py
+++ b/comfy_api_nodes/apis/anthropic.py
@ -35,6 +35,19 @@ class AnthropicMessage(BaseModel):
    content: list[AnthropicTextContent | AnthropicImageContent] = Field(...)


+class AnthropicThinkingConfig(BaseModel):
+    type: Literal["enabled", "disabled", "adaptive"] = Field(...)
+    budget_tokens: int | None = Field(
+        None, ge=1024,
+        description="Reasoning budget in tokens. Used when type is 'enabled'. Must be less than max_tokens.",
+    )
+
+
+class AnthropicOutputConfig(BaseModel):
+    """Used with `thinking.type='adaptive'` on models like Opus 4.7."""
+    effort: Literal["low", "medium", "high"] | None = Field(None)
+
+
 class AnthropicMessagesRequest(BaseModel):
    model: str = Field(...)
    messages: list[AnthropicMessage] = Field(...)
@ -44,6 +57,8 @@ class AnthropicMessagesRequest(BaseModel):
    top_p: float | None = Field(None, ge=0.0, le=1.0)
    top_k: int | None = Field(None, ge=0)
    stop_sequences: list[str] | None = Field(None)
+    thinking: AnthropicThinkingConfig | None = Field(None)
+    output_config: AnthropicOutputConfig | None = Field(None)


 class AnthropicResponseTextBlock(BaseModel):
@ -51,6 +66,14 @@ class AnthropicResponseTextBlock(BaseModel):
    text: str = Field(...)


+class AnthropicResponseThinkingBlock(BaseModel):
+    type: Literal["thinking"] = "thinking"
+    thinking: str = Field(...)
+
+
+AnthropicResponseBlock = AnthropicResponseTextBlock | AnthropicResponseThinkingBlock
+
+
 class AnthropicCacheCreationUsage(BaseModel):
    ephemeral_5m_input_tokens: int | None = Field(None)
    ephemeral_1h_input_tokens: int | None = Field(None)
@ -69,7 +92,7 @@ class AnthropicMessagesResponse(BaseModel):
    type: str | None = Field(None)
    role: str | None = Field(None)
    model: str | None = Field(None)
-    content: list[AnthropicResponseTextBlock] | None = Field(None)
+    content: list[AnthropicResponseBlock] | None = Field(None)
    stop_reason: str | None = Field(None)
    stop_sequence: str | None = Field(None)
    usage: AnthropicMessagesUsage | None = Field(None)
--- a/comfy_api_nodes/apis/openrouter.py
+++ b/comfy_api_nodes/apis/openrouter.py
@ -0,0 +1,93 @@
+"""Pydantic models for the OpenRouter chat completions API.
+
+See: https://openrouter.ai/docs/api/api-reference/chat/send-chat-completion-request
+"""
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class OpenRouterTextContent(BaseModel):
+    type: Literal["text"] = "text"
+    text: str = Field(...)
+
+
+class OpenRouterImageUrl(BaseModel):
+    url: str = Field(...)
+
+
+class OpenRouterImageContent(BaseModel):
+    type: Literal["image_url"] = "image_url"
+    image_url: OpenRouterImageUrl = Field(...)
+
+
+class OpenRouterVideoUrl(BaseModel):
+    url: str = Field(...)
+
+
+class OpenRouterVideoContent(BaseModel):
+    type: Literal["video_url"] = "video_url"
+    video_url: OpenRouterVideoUrl = Field(...)
+
+
+OpenRouterContentBlock = OpenRouterTextContent | OpenRouterImageContent | OpenRouterVideoContent
+
+
+class OpenRouterMessage(BaseModel):
+    role: Literal["system", "user", "assistant"] = Field(...)
+    content: str | list[OpenRouterContentBlock] = Field(...)
+
+
+class OpenRouterReasoningConfig(BaseModel):
+    effort: str | None = Field(None)
+    exclude: bool | None = Field(None, description="If true, model reasons but reasoning is excluded from response.")
+
+
+class OpenRouterWebSearchOptions(BaseModel):
+    search_context_size: str | None = Field(None)
+
+
+class OpenRouterChatRequest(BaseModel):
+    model: str = Field(...)
+    messages: list[OpenRouterMessage] = Field(...)
+    seed: int | None = Field(None)
+    reasoning: OpenRouterReasoningConfig | None = Field(None)
+    web_search_options: OpenRouterWebSearchOptions | None = Field(None)
+    stream: bool = Field(False)
+
+
+class OpenRouterUsage(BaseModel):
+    prompt_tokens: int | None = Field(None)
+    completion_tokens: int | None = Field(None)
+    total_tokens: int | None = Field(None)
+    cost: float | None = Field(None, description="Server-side authoritative USD cost of the call.")
+
+
+class OpenRouterResponseMessage(BaseModel):
+    role: str | None = Field(None)
+    content: str | None = Field(None)
+    reasoning: str | None = Field(None)
+    refusal: str | None = Field(None)
+
+
+class OpenRouterChoice(BaseModel):
+    index: int | None = Field(None)
+    message: OpenRouterResponseMessage | None = Field(None)
+    finish_reason: str | None = Field(None)
+
+
+class OpenRouterError(BaseModel):
+    code: int | str | None = Field(None)
+    message: str | None = Field(None)
+    metadata: dict | None = Field(None)
+
+
+class OpenRouterChatResponse(BaseModel):
+    id: str | None = Field(None)
+    model: str | None = Field(None)
+    object: str | None = Field(None)
+    provider: str | None = Field(None)
+    choices: list[OpenRouterChoice] | None = Field(None)
+    usage: OpenRouterUsage | None = Field(None)
+    error: OpenRouterError | None = Field(None)
--- a/comfy_api_nodes/nodes_anthropic.py
+++ b/comfy_api_nodes/nodes_anthropic.py
@ -9,8 +9,11 @@ from comfy_api_nodes.apis.anthropic import (
    AnthropicMessage,
    AnthropicMessagesRequest,
    AnthropicMessagesResponse,
+    AnthropicOutputConfig,
+    AnthropicResponseTextBlock,
    AnthropicRole,
    AnthropicTextContent,
+    AnthropicThinkingConfig,
 )
 from comfy_api_nodes.util import (
    ApiEndpoint,
@ -32,15 +35,29 @@ CLAUDE_MODELS: dict[str, str] = {
    "Haiku 4.5": "claude-haiku-4-5-20251001",
 }

+_THINKING_UNSUPPORTED = {"Haiku 4.5"}
+# Models that use the newer "adaptive" thinking mode (Opus 4.7 requires it; older models keep the explicit budget API).
+# Anthropic decides the actual budget when adaptive is used, based on the `output_config.effort` hint.
+_ADAPTIVE_THINKING_MODELS = {"Opus 4.7", "Opus 4.6", "Sonnet 4.6"}

-def _claude_model_inputs():
-    return [
+# Budget mode (Sonnet 4.5): effort -> reasoning budget in tokens. Must be < max_tokens.
+# Sized so even the "high" budget fits comfortably under the default max_tokens=32768.
+_REASONING_BUDGET: dict[str, int] = {
+    "low": 2048,
+    "medium": 8192,
+    "high": 16384,
+}
+_REASONING_EFFORTS = ["off", "low", "medium", "high"]
+
+
+def _claude_model_inputs(model_label: str):
+    inputs: list = [
        IO.Int.Input(
            "max_tokens",
-            default=16000,
-            min=32,
-            max=32000,
-            tooltip="Maximum number of tokens to generate before stopping.",
+            default=32768,
+            min=4096,
+            max=64000,
+            tooltip="Maximum number of tokens to generate (includes reasoning tokens when enabled).",
            advanced=True,
        ),
        IO.Float.Input(
@ -49,10 +66,24 @@ def _claude_model_inputs():
            min=0.0,
            max=1.0,
            step=0.01,
-            tooltip="Controls randomness. 0.0 is deterministic, 1.0 is most random. Ignored for Opus 4.7.",
+            tooltip=(
+                "Controls randomness. 0.0 is deterministic, 1.0 is most random. "
+                "Ignored for Opus 4.7 and any model when reasoning_effort is set."
+            ),
            advanced=True,
        ),
    ]
+    if model_label not in _THINKING_UNSUPPORTED:
+        inputs.append(
+            IO.Combo.Input(
+                "reasoning_effort",
+                options=_REASONING_EFFORTS,
+                default="off",
+                tooltip="Extended thinking effort. 'off' disables reasoning.",
+                advanced=True,
+            )
+        )
+    return inputs


 def _model_price_per_million(model: str) -> tuple[float, float] | None:
@ -95,7 +126,11 @@ def calculate_tokens_price(response: AnthropicMessagesResponse) -> float | None:
 def _get_text_from_response(response: AnthropicMessagesResponse) -> str:
    if not response.content:
        return ""
-    return "\n".join(block.text for block in response.content if block.text)
+    # Thinking blocks are silently dropped — we never want reasoning in the output.
+    return "\n".join(
+        block.text for block in response.content
+        if isinstance(block, AnthropicResponseTextBlock) and block.text
+    )


 async def _build_image_content_blocks(
@ -133,7 +168,10 @@ class ClaudeNode(IO.ComfyNode):
                ),
                IO.DynamicCombo.Input(
                    "model",
-                    options=[IO.DynamicCombo.Option(label, _claude_model_inputs()) for label in CLAUDE_MODELS],
+                    options=[
+                        IO.DynamicCombo.Option(label, _claude_model_inputs(label))
+                        for label in CLAUDE_MODELS
+                    ],
                    tooltip="The Claude model used to generate the response.",
                ),
                IO.Int.Input(
@ -207,8 +245,29 @@ class ClaudeNode(IO.ComfyNode):
    ) -> IO.NodeOutput:
        validate_string(prompt, strip_whitespace=True, min_length=1)
        model_label = model["model"]
-        max_tokens = model["max_tokens"]
-        temperature = None if model_label == "Opus 4.7" else model["temperature"]
+        max_tokens = model.get("max_tokens", 32768)
+        reasoning_effort = model.get("reasoning_effort", "off")
+        thinking_enabled = reasoning_effort not in ("off", None) and model_label not in _THINKING_UNSUPPORTED
+
+        # Anthropic requires temperature to be unset (defaults to 1.0) when thinking is enabled.
+        # Opus 4.7 also rejects user-supplied temperature.
+        if thinking_enabled or model_label == "Opus 4.7":
+            temperature = None
+        else:
+            temperature = model.get("temperature", 1.0)
+
+        thinking_cfg: AnthropicThinkingConfig | None = None
+        output_cfg: AnthropicOutputConfig | None = None
+        if thinking_enabled:
+            if model_label in _ADAPTIVE_THINKING_MODELS:
+                # Adaptive mode - Anthropic chooses the budget based on effort hint
+                thinking_cfg = AnthropicThinkingConfig(type="adaptive")
+                output_cfg = AnthropicOutputConfig(effort=reasoning_effort)
+            else:
+                # Budget mode (Sonnet 4.5). Leave at least 1024 tokens for the actual response
+                budget = _REASONING_BUDGET[reasoning_effort]
+                budget = min(budget, max(1024, max_tokens - 1024))
+                thinking_cfg = AnthropicThinkingConfig(type="enabled", budget_tokens=budget)

        image_tensors: list[Input.Image] = [t for t in (images or {}).values() if t is not None]
        if sum(get_number_of_images(t) for t in image_tensors) > CLAUDE_MAX_IMAGES:
@ -229,6 +288,8 @@ class ClaudeNode(IO.ComfyNode):
                messages=[AnthropicMessage(role=AnthropicRole.user, content=content)],
                system=system_prompt or None,
                temperature=temperature,
+                thinking=thinking_cfg,
+                output_config=output_cfg,
            ),
            price_extractor=calculate_tokens_price,
        )
--- a/comfy_api_nodes/nodes_openrouter.py
+++ b/comfy_api_nodes/nodes_openrouter.py
@ -0,0 +1,374 @@
+"""API Nodes for OpenRouter LLM chat completions."""
+
+from dataclasses import dataclass
+from typing import Literal
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.openrouter import (
+    OpenRouterChatRequest,
+    OpenRouterChatResponse,
+    OpenRouterContentBlock,
+    OpenRouterImageContent,
+    OpenRouterImageUrl,
+    OpenRouterMessage,
+    OpenRouterReasoningConfig,
+    OpenRouterTextContent,
+    OpenRouterVideoContent,
+    OpenRouterVideoUrl,
+    OpenRouterWebSearchOptions,
+)
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    get_number_of_images,
+    sync_op,
+    upload_images_to_comfyapi,
+    upload_video_to_comfyapi,
+    validate_string,
+)
+
+OPENROUTER_CHAT_ENDPOINT = "/proxy/openrouter/api/v1/chat/completions"
+
+
+Profile = Literal["standard", "reasoning", "frontier_reasoning", "perplexity", "perplexity_reasoning"]
+
+
+@dataclass(frozen=True)
+class _ModelSpec:
+    slug: str  # exact OpenRouter model id
+    profile: Profile
+    price_in: float  # USD per token (prompt)
+    price_out: float  # USD per token (completion)
+    max_images: int = 0  # 0 = no image input; otherwise max URL-passed images supported
+    max_videos: int = 0  # 0 = no video input; otherwise max URL-passed videos supported
+
+
+MODELS: list[_ModelSpec] = [
+    _ModelSpec("anthropic/claude-opus-4.7", "frontier_reasoning", 0.000005, 0.000025, max_images=20),
+    _ModelSpec("openai/gpt-5.5-pro", "frontier_reasoning", 0.00003, 0.00018, max_images=20),
+    _ModelSpec("openai/gpt-5.5", "frontier_reasoning", 0.000005, 0.00003, max_images=20),
+    _ModelSpec("google/gemini-3.5-flash", "reasoning", 0.0000015, 0.000009, max_images=20, max_videos=4),
+    _ModelSpec("x-ai/grok-4.20", "reasoning", 0.00000125, 0.0000025, max_images=20),
+    _ModelSpec("x-ai/grok-4.3", "reasoning", 0.00000125, 0.0000025, max_images=20),
+    _ModelSpec("deepseek/deepseek-v4-pro", "reasoning", 0.000000435, 0.00000087),
+    _ModelSpec("deepseek/deepseek-v4-flash", "reasoning", 0.000000112, 0.000000224),
+    _ModelSpec("deepseek/deepseek-v3.2", "reasoning", 0.000000252, 0.000000378),
+    _ModelSpec("qwen/qwen3.6-max-preview", "reasoning", 0.00000104, 0.00000624),
+    _ModelSpec("qwen/qwen3.6-plus", "reasoning", 0.000000325, 0.00000195, max_images=10, max_videos=4),
+    _ModelSpec("qwen/qwen3.6-flash", "reasoning", 0.0000001875, 0.000001125, max_images=10, max_videos=4),
+    _ModelSpec("mistralai/mistral-large-2512", "standard", 0.0000005, 0.0000015, max_images=8),
+    _ModelSpec("mistralai/mistral-medium-3-5", "reasoning", 0.0000015, 0.0000075, max_images=8),
+    _ModelSpec("z-ai/glm-4.6", "reasoning", 0.00000043, 0.00000174),
+    _ModelSpec("z-ai/glm-5", "reasoning", 0.0000006, 0.00000192),
+    _ModelSpec("moonshotai/kimi-k2.6", "reasoning", 0.00000073, 0.00000349, max_images=10),
+    _ModelSpec("moonshotai/kimi-k2-thinking", "reasoning", 0.0000006, 0.0000025),
+    _ModelSpec("perplexity/sonar-pro", "perplexity", 0.000003, 0.000015),
+    _ModelSpec("perplexity/sonar-reasoning-pro", "perplexity_reasoning", 0.000002, 0.000008),
+    _ModelSpec("perplexity/sonar-deep-research", "perplexity_reasoning", 0.000002, 0.000008),
+]
+
+_MODELS_BY_SLUG: dict[str, _ModelSpec] = {m.slug: m for m in MODELS}
+_REASONING_EFFORTS = ["off", "low", "medium", "high"]
+_SEARCH_CONTEXT_SIZES = ["low", "medium", "high"]
+
+
+def _reasoning_extra_inputs() -> list:
+    return [
+        IO.Combo.Input(
+            "reasoning_effort",
+            options=_REASONING_EFFORTS,
+            default="off",
+            tooltip="Reasoning effort. 'off' disables reasoning entirely.",
+            advanced=True,
+        ),
+    ]
+
+
+def _perplexity_extra_inputs() -> list:
+    return [
+        IO.Combo.Input(
+            "search_context_size",
+            options=_SEARCH_CONTEXT_SIZES,
+            default="medium",
+            tooltip="How much web search context to retrieve. Larger = more grounded but slower/pricier.",
+            advanced=True,
+        ),
+    ]
+
+
+def _profile_inputs(profile: Profile) -> list:
+    if profile == "standard":
+        return []
+    if profile in ("reasoning", "frontier_reasoning"):
+        return _reasoning_extra_inputs()
+    if profile == "perplexity":
+        return _perplexity_extra_inputs()
+    if profile == "perplexity_reasoning":
+        return _perplexity_extra_inputs() + _reasoning_extra_inputs()
+    raise ValueError(f"Unknown profile: {profile}")
+
+
+def _media_inputs(spec: _ModelSpec) -> list:
+    extras: list = []
+    if spec.max_images > 0:
+        extras.append(
+            IO.Autogrow.Input(
+                "images",
+                template=IO.Autogrow.TemplateNames(
+                    IO.Image.Input("image"),
+                    names=[f"image_{i}" for i in range(1, spec.max_images + 1)],
+                    min=0,
+                ),
+                tooltip=f"Optional reference image(s) — up to {spec.max_images}. Sent as URLs.",
+            )
+        )
+    if spec.max_videos > 0:
+        extras.append(
+            IO.Autogrow.Input(
+                "videos",
+                template=IO.Autogrow.TemplateNames(
+                    IO.Video.Input("video"),
+                    names=[f"video_{i}" for i in range(1, spec.max_videos + 1)],
+                    min=0,
+                ),
+                tooltip=f"Optional reference video(s) — up to {spec.max_videos}. Sent as URLs.",
+            )
+        )
+    return extras
+
+
+def _inputs_for_model(spec: _ModelSpec) -> list:
+    return _profile_inputs(spec.profile) + _media_inputs(spec)
+
+
+def _build_model_options() -> list[IO.DynamicCombo.Option]:
+    return [IO.DynamicCombo.Option(spec.slug, _inputs_for_model(spec)) for spec in MODELS]
+
+
+def _calculate_price(response: OpenRouterChatResponse) -> float | None:
+    if response.usage and response.usage.cost is not None:
+        return float(response.usage.cost)
+    return None
+
+
+def _price_badge_jsonata() -> str:
+    rates_pairs = []
+    for spec in MODELS:
+        prompt_per_1k = spec.price_in * 1000
+        completion_per_1k = spec.price_out * 1000
+        rates_pairs.append(f'  "{spec.slug}": [{prompt_per_1k:.8g}, {completion_per_1k:.8g}]')
+    rates_block = ",\n".join(rates_pairs)
+    return (
+        "(\n"
+        "  $rates := {\n"
+        f"{rates_block}\n"
+        "  };\n"
+        "  $r := $lookup($rates, widgets.model);\n"
+        "  $r ? {\n"
+        '    "type": "list_usd",\n'
+        '    "usd": $r,\n'
+        '    "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }\n'
+        '  } : {"type": "text", "text": "Token-based"}\n'
+        ")"
+    )
+
+
+async def _build_image_blocks(
+    cls: type[IO.ComfyNode], spec: _ModelSpec, images: list[Input.Image]
+) -> list[OpenRouterImageContent]:
+    urls = await upload_images_to_comfyapi(
+        cls,
+        images,
+        max_images=spec.max_images,
+        total_pixels=2048 * 2048,
+        mime_type="image/png",
+        wait_label="Uploading reference images",
+    )
+    return [OpenRouterImageContent(image_url=OpenRouterImageUrl(url=url)) for url in urls]
+
+
+async def _build_video_blocks(cls: type[IO.ComfyNode], videos: list[Input.Video]) -> list[OpenRouterVideoContent]:
+    blocks: list[OpenRouterVideoContent] = []
+    total = len(videos)
+    for idx, video in enumerate(videos):
+        label = "Uploading reference video"
+        if total > 1:
+            label = f"{label} ({idx + 1}/{total})"
+        url = await upload_video_to_comfyapi(cls, video, wait_label=label)
+        blocks.append(OpenRouterVideoContent(video_url=OpenRouterVideoUrl(url=url)))
+    return blocks
+
+
+def _user_message(prompt: str, media_blocks: list[OpenRouterContentBlock]) -> OpenRouterMessage:
+    if not media_blocks:
+        return OpenRouterMessage(role="user", content=prompt)
+    blocks: list[OpenRouterContentBlock] = list(media_blocks)
+    blocks.append(OpenRouterTextContent(text=prompt))
+    return OpenRouterMessage(role="user", content=blocks)
+
+
+def _build_messages(
+    system_prompt: str, prompt: str, media_blocks: list[OpenRouterContentBlock]
+) -> list[OpenRouterMessage]:
+    messages: list[OpenRouterMessage] = []
+    if system_prompt:
+        messages.append(OpenRouterMessage(role="system", content=system_prompt))
+    messages.append(_user_message(prompt, media_blocks))
+    return messages
+
+
+def _build_request(
+    slug: str,
+    system_prompt: str,
+    prompt: str,
+    media_blocks: list[OpenRouterContentBlock],
+    *,
+    seed: int,
+    reasoning_effort: str | None,
+    search_context_size: str | None,
+) -> OpenRouterChatRequest:
+    reasoning_cfg: OpenRouterReasoningConfig | None = None
+    if reasoning_effort and reasoning_effort != "off":
+        # exclude=True asks providers to reason internally but not return the trace
+        reasoning_cfg = OpenRouterReasoningConfig(effort=reasoning_effort, exclude=True)
+    web_search_cfg: OpenRouterWebSearchOptions | None = None
+    if search_context_size:
+        web_search_cfg = OpenRouterWebSearchOptions(search_context_size=search_context_size)
+    return OpenRouterChatRequest(
+        model=slug,
+        messages=_build_messages(system_prompt, prompt, media_blocks),
+        seed=seed if seed > 0 else None,
+        reasoning=reasoning_cfg,
+        web_search_options=web_search_cfg,
+    )
+
+
+def _extract_text(response: OpenRouterChatResponse) -> str:
+    if response.error:
+        code = response.error.code if response.error.code is not None else "unknown"
+        raise ValueError(f"OpenRouter error ({code}): {response.error.message or 'no message'}")
+    if not response.choices:
+        raise ValueError("Empty response from OpenRouter (no choices).")
+    message = response.choices[0].message
+    if not message:
+        raise ValueError("Empty response from OpenRouter (no message).")
+    if message.refusal:
+        raise ValueError(f"Model refused to respond: {message.refusal}")
+    return message.content or ""
+
+
+class OpenRouterLLMNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="OpenRouterLLMNode",
+            display_name="OpenRouter LLM",
+            category="api node/text/OpenRouter",
+            essentials_category="Text Generation",
+            description=(
+                "Generate text responses through OpenRouter. Routes to a curated set of popular "
+                "models from xAI, DeepSeek, Qwen, Mistral, Z.AI (GLM), Moonshot (Kimi), and "
+                "Perplexity Sonar."
+            ),
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text input to the model.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=_build_model_options(),
+                    tooltip="The OpenRouter model used to generate the response.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Seed for sampling. Set to 0 to omit. Most models treat this as a hint only.",
+                ),
+                IO.String.Input(
+                    "system_prompt",
+                    multiline=True,
+                    default="",
+                    optional=True,
+                    advanced=True,
+                    tooltip="Foundational instructions that dictate the model's behavior.",
+                ),
+            ],
+            outputs=[IO.String.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+                expr=_price_badge_jsonata(),
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        seed: int,
+        system_prompt: str = "",
+    ) -> IO.NodeOutput:
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        slug: str = model["model"]
+        spec = _MODELS_BY_SLUG.get(slug)
+        if spec is None:
+            raise ValueError(f"Unknown OpenRouter model: {slug}")
+
+        reasoning_effort: str | None = model.get("reasoning_effort")
+        search_context_size: str | None = model.get("search_context_size")
+
+        image_tensors: list[Input.Image] = [t for t in (model.get("images") or {}).values() if t is not None]
+        if image_tensors and sum(get_number_of_images(t) for t in image_tensors) > spec.max_images:
+            raise ValueError(f"Up to {spec.max_images} images are supported for {slug}.")
+        video_inputs: list[Input.Video] = [v for v in (model.get("videos") or {}).values() if v is not None]
+        if video_inputs and len(video_inputs) > spec.max_videos:
+            raise ValueError(f"Up to {spec.max_videos} videos are supported for {slug}.")
+
+        media_blocks: list[OpenRouterContentBlock] = []
+        if image_tensors:
+            media_blocks.extend(await _build_image_blocks(cls, spec, image_tensors))
+        if video_inputs:
+            media_blocks.extend(await _build_video_blocks(cls, video_inputs))
+
+        request = _build_request(
+            slug,
+            system_prompt,
+            prompt,
+            media_blocks,
+            seed=seed,
+            reasoning_effort=reasoning_effort,
+            search_context_size=search_context_size,
+        )
+
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path=OPENROUTER_CHAT_ENDPOINT, method="POST"),
+            response_model=OpenRouterChatResponse,
+            data=request,
+            price_extractor=_calculate_price,
+        )
+        return IO.NodeOutput(_extract_text(response))
+
+
+class OpenRouterExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        return [OpenRouterLLMNode]
+
+
+async def comfy_entrypoint() -> OpenRouterExtension:
+    return OpenRouterExtension()
--- a/comfy_extras/nodes_depth_anything_3.py
+++ b/comfy_extras/nodes_depth_anything_3.py
@ -1,435 +0,0 @@
-"""ComfyUI nodes for Depth Anything 3.
-
-Adds these nodes:
-
-* ``LoadDepthAnything3`` -- load a DA3 ``.safetensors`` file from the
-  ``models/geometry_estimation/`` folder.
-* ``DepthAnything3`` -- unified depth estimation node supporting both mono and
-  multi-view modes via a DynamicCombo selector. In mono mode, returns a
-  normalised depth image plus sky/confidence masks. In multi-view mode,
-  additionally returns per-view extrinsics, intrinsics and raw depth packed
-  as a LATENT.
-
-Model capability matrix
-----------------------
-  Variant               head_type  has_sky  has_conf  cam_dec
-  DA3-Small             dualdpt    False    True      yes
-  DA3-Base              dualdpt    False    True      yes
-  DA3-Mono-Large        dpt        True     False     no
-  DA3-Metric-Large      dpt        True     False     no  (raw output is metres)
-
-The node raises a ``ValueError`` at execution time when the selected
-parameters conflict with the loaded model's capabilities (e.g.
-``apply_sky_clip=True`` on a model with no sky head).
-"""
-
-from __future__ import annotations
-
-from typing_extensions import override
-
-import torch
-
-import comfy.model_management as mm
-import comfy.sd
-import folder_paths
-from comfy.ldm.depth_anything_3 import preprocess as da3_preprocess
-from comfy_api.latest import ComfyExtension, io
-
-
-class LoadDepthAnything3(io.ComfyNode):
-    @classmethod
-    def define_schema(cls):
-        return io.Schema(
-            node_id="LoadDepthAnything3",
-            display_name="Load Depth Anything 3",
-            category="loaders/geometry_estimation",
-            inputs=[
-                io.Combo.Input(
-                    "model_name",
-                    options=folder_paths.get_filename_list("geometry_estimation"),
-                ),
-                io.Combo.Input(
-                    "weight_dtype",
-                    options=["default", "fp16", "bf16", "fp32"],
-                    default="default",
-                ),
-            ],
-            outputs=[io.Model.Output("model")],
-        )
-
-    @classmethod
-    def execute(cls, model_name, weight_dtype) -> io.NodeOutput:
-        model_options = {}
-        if weight_dtype == "fp16":
-            model_options["dtype"] = torch.float16
-        elif weight_dtype == "bf16":
-            model_options["dtype"] = torch.bfloat16
-        elif weight_dtype == "fp32":
-            model_options["dtype"] = torch.float32
-
-        path = folder_paths.get_full_path_or_raise("geometry_estimation", model_name)
-        model = comfy.sd.load_diffusion_model(path, model_options=model_options)
-        return io.NodeOutput(model)
-
-
-def _normalize_confidence(conf: torch.Tensor) -> torch.Tensor:
-    """Map raw confidence (expp1 activaton, range [1, ∞)) to [0, 1] per image.
-
-    The model uses ``exp(x) + 1`` so every pixel is guaranteed to be ≥ 1.
-    Min-max normalization per image preserves the spatial pattern (high
-    confidence = brighter) while producing a valid mask in [0, 1].
-    """
-    B = conf.shape[0]
-    out = []
-    for i in range(B):
-        c = conf[i]
-        c_min = c.min()
-        c_max = c.max()
-        if c_max > c_min:
-            out.append((c - c_min) / (c_max - c_min))
-        else:
-            out.append(torch.ones_like(c))
-    return torch.stack(out, dim=0)
-
-
-def _run_da3(model_patcher, image: torch.Tensor, process_res: int,
-             method: str = "upper_bound_resize"):
-    """Run DA3 on ``(B,H,W,3)`` IMAGE; returns depth/conf/sky at original resolution (or None)."""
-    assert image.ndim == 4 and image.shape[-1] == 3, \
-        f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
-
-    B, H, W, _ = image.shape
-    mm.load_model_gpu(model_patcher)
-    diffusion = model_patcher.model.diffusion_model
-    device = mm.get_torch_device()
-    dtype = diffusion.dtype if diffusion.dtype is not None else torch.float32
-
-    depths, confs, skies = [], [], []
-    for i in range(B):
-        single = image[i:i + 1].to(device)
-        x = da3_preprocess.preprocess_image(single, process_res=process_res, method=method)
-        x = x.to(dtype=dtype)
-        with torch.no_grad():
-            out = diffusion(x)
-
-        depth_lr = out["depth"]
-        depth_full = torch.nn.functional.interpolate(
-            depth_lr.unsqueeze(1).float(), size=(H, W),
-            mode="bilinear", align_corners=False,
-        ).squeeze(1).cpu()
-        depths.append(depth_full)
-
-        if "depth_conf" in out:
-            conf_full = torch.nn.functional.interpolate(
-                out["depth_conf"].unsqueeze(1).float(), size=(H, W),
-                mode="bilinear", align_corners=False,
-            ).squeeze(1).cpu()
-            confs.append(conf_full)
-        if "sky" in out:
-            sky_full = torch.nn.functional.interpolate(
-                out["sky"].unsqueeze(1).float(), size=(H, W),
-                mode="bilinear", align_corners=False,
-            ).squeeze(1).cpu()
-            skies.append(sky_full)
-
-    depth = torch.cat(depths, dim=0)
-    confidence = torch.cat(confs, dim=0) if confs else None
-    sky = torch.cat(skies, dim=0) if skies else None
-    return depth, confidence, sky
-
-
-class DepthAnything3(io.ComfyNode):
-    """Unified Depth Anything 3 node.
-
-    Mono mode
-    ---------
-    Runs the model on each batch element independently and returns a
-    normalised depth image together with sky and confidence masks.
-
-    Multi-view mode
-    ---------------
-    Treats every batch element as a separate view of the same scene.
-    Runs all views in a single forward pass so cross-view attention can
-    establish geometric consistency. Additionally returns a ``LATENT``
-    dict with per-view camera extrinsics, intrinsics and raw depth.
-
-    Capability errors
-    -----------------
-    A ``ValueError`` is raised immediately when a parameter requires a
-    model feature that is absent in the loaded checkpoint (e.g.
-    ``apply_sky_clip=True`` on DA3-Small/Base which has no sky head,
-    or ``pose_method='cam_dec'`` on a monocular model).
-
-    Camera LATENT structure (multi-view only)
-    -----------------------------------------
-      samples:    (1, S, 1, H, W)  -- raw depth packed as latent samples
-      type:       "da3_multiview"
-      extrinsics: (1, S, 4, 4)     -- world-to-camera matrices
-      intrinsics: (1, S, 3, 3)     -- pixel-space intrinsics
-      depth_raw:  (S, H, W)        -- un-normalised depth
-      confidence: (S, H, W)        -- per-pixel confidence (zeros if N/A)
-    """
-
-    @classmethod
-    def define_schema(cls):
-        return io.Schema(
-            node_id="DepthAnything3",
-            display_name="Depth Anything 3",
-            category="image/depth",
-            inputs=[
-                io.Model.Input("model"),
-                io.Image.Input("image",
-                               tooltip="Single image or image batch. "
-                                       "In multi-view mode each frame is treated as "
-                                       "a separate view of the same scene."),
-                io.Int.Input("process_res", default=504, min=140, max=2520, step=14,
-                             tooltip="Resolution the model runs at (longest side, multiple of 14). "
-                                     "Lower = faster / less VRAM; higher = more detail. "
-                                     "Output is upsampled back to the original size."),
-                io.Combo.Input("resize_method",
-                               options=["upper_bound_resize", "lower_bound_resize"],
-                               default="upper_bound_resize",
-                               tooltip="upper_bound_resize: scale so the longest side = process_res "
-                                       "(caps memory, default). "
-                                       "lower_bound_resize: scale so the shortest side = process_res "
-                                       "(preserves more detail on tall/wide images, uses more memory)."),
-                io.Combo.Input("normalization",
-                               options=["v2_style", "min_max", "raw"],
-                               default="v2_style",
-                               tooltip="How to map raw depth to [0, 1] for the output image. "
-                                       "'raw' preserves absolute values — use this to keep "
-                                       "metric units when running DA3-Metric-Large."),
-                io.Boolean.Input("apply_sky_clip", default=False,
-                                 tooltip="Clip sky-region depth to the 99th percentile before "
-                                         "normalisation. Requires a sky segmentation head "
-                                         "(DA3-Mono-Large or DA3-Metric-Large). "
-                                         "Raises an error on DA3-Small/Base."),
-                io.DynamicCombo.Input("mode",
-                                      tooltip="mono: single image or independent batch — "
-                                              "use with any model. "
-                                              "multiview: all frames processed together with "
-                                              "cross-view attention for geometric consistency; "
-                                              "also outputs camera pose — requires DA3-Small or DA3-Base.",
-                                      options=[
-                    io.DynamicCombo.Option("mono", []),
-                    io.DynamicCombo.Option("multiview", [
-                        io.Combo.Input("ref_view_strategy",
-                                       options=["saddle_balanced", "saddle_sim_range",
-                                                "first", "middle"],
-                                       default="saddle_balanced",
-                                       tooltip="Which view to use as the geometric anchor "
-                                               "(only applied when S >= 3 and no extrinsics "
-                                               "are provided). "
-                                               "saddle_balanced: picks the view whose CLS-token "
-                                               "features are closest to the median across "
-                                               "similarity, norm and variance — best general "
-                                               "choice. "
-                                               "saddle_sim_range: picks the view with the widest "
-                                               "similarity spread to other views — favours "
-                                               "the most distinct viewpoint. "
-                                               "first / middle: deterministic positional fallbacks."),
-                        io.Combo.Input("pose_method",
-                                       options=["cam_dec", "ray_pose"],
-                                       default="cam_dec",
-                                       tooltip="cam_dec: small MLP on the final camera token "
-                                               "(DA3-Small/Base). "
-                                               "ray_pose: RANSAC over the DualDPT ray output "
-                                               "(DA3-Small/Base only)."),
-                    ]),
-                ]),
-            ],
-            outputs=[
-                io.Image.Output("depth_image"),
-                io.Mask.Output("sky_mask",
-                               tooltip="Sky probability mask (Mono/Metric variants). "
-                                       "Zeros for Small/Base."),
-                io.Mask.Output("confidence",
-                               tooltip="Depth confidence (Small/Base variants). "
-                                       "Zeros for Mono/Metric."),
-                io.Latent.Output("camera",
-                                 tooltip="Multi-view: per-view extrinsics + intrinsics + raw depth. "
-                                         "In mono mode this is an empty placeholder."),
-            ],
-        )
-
-    @classmethod
-    def execute(cls, model, image, process_res, resize_method, normalization,
-                apply_sky_clip, mode) -> io.NodeOutput:
-        diffusion = model.model.diffusion_model
-        mode_val = mode["mode"]  # "mono" or "multiview"
-
-        # Capability check for sky clip — fires in both modes.
-        if apply_sky_clip and not diffusion.has_sky:
-            raise ValueError(
-                "apply_sky_clip=True requires a sky segmentation head, but the loaded "
-                "model does not have one. Set apply_sky_clip=False, or load a model "
-                "that includes a sky head (e.g. DA3-Mono-Large or DA3-Metric-Large)."
-            )
-
-        if mode_val == "mono":
-            return cls._execute_mono(
-                model, image, process_res, resize_method,
-                normalization, apply_sky_clip,
-            )
-
-        # Capability checks for multi-view pose.
-        pose_method = mode["pose_method"]
-        ref_view_strategy = mode["ref_view_strategy"]
-
-        if pose_method == "cam_dec" and diffusion.cam_dec is None:
-            raise ValueError(
-                "pose_method='cam_dec' requires a camera decoder, but the loaded "
-                "model does not have one. Load a model with a camera decoder "
-                "(e.g. DA3-Small or DA3-Base), or set pose_method='ray_pose'."
-            )
-        if pose_method == "ray_pose" and diffusion.head_type != "dualdpt":
-            raise ValueError(
-                "pose_method='ray_pose' requires a DualDPT head, but the loaded "
-                "model has a DPT head. Load a model with a DualDPT head "
-                "(e.g. DA3-Small or DA3-Base), or set pose_method='cam_dec'."
-            )
-
-        return cls._execute_multiview(
-            model, image, process_res, resize_method,
-            normalization, apply_sky_clip,
-            ref_view_strategy, pose_method,
-        )
-
-    @staticmethod
-    def _apply_sky_clip(depth: torch.Tensor, sky: torch.Tensor) -> torch.Tensor:
-        return torch.stack([
-            da3_preprocess.apply_sky_aware_clip(depth[i], sky[i])
-            for i in range(depth.shape[0])
-        ], dim=0)
-
-    @staticmethod
-    def _depth_to_image(depth: torch.Tensor, sky_for_norm: torch.Tensor | None,
-                        normalization: str) -> torch.Tensor:
-        """Normalise depth and pack as an (N,H,W,3) image tensor.
-
-        Preserves metric units when normalization is 'raw' (no clamping).
-        """
-        N = depth.shape[0]
-        if normalization == "v2_style":
-            norm = torch.stack([
-                da3_preprocess.normalize_depth_v2_style(
-                    depth[i], sky_for_norm[i] if sky_for_norm is not None else None)
-                for i in range(N)
-            ], dim=0)
-        elif normalization == "min_max":
-            norm = da3_preprocess.normalize_depth_min_max(depth)
-        else:
-            norm = depth
-
-        # Preserve metric units when normalization is raw.
-        out = norm.unsqueeze(-1).repeat(1, 1, 1, 3)
-        if normalization != "raw":
-            out = out.clamp(0.0, 1.0)
-        return out.contiguous()
-
-    @classmethod
-    def _execute_mono(cls, model, image, process_res, resize_method,
-                      normalization, apply_sky_clip) -> io.NodeOutput:
-        depth, confidence, sky = _run_da3(model, image, process_res, method=resize_method)
-
-        if apply_sky_clip and sky is not None:
-            depth = cls._apply_sky_clip(depth, sky)
-
-        out_image = cls._depth_to_image(depth, sky, normalization)
-
-        sky_mask = sky if sky is not None else torch.zeros_like(depth)
-        conf_mask = (_normalize_confidence(confidence)
-                     if confidence is not None else torch.zeros_like(depth))
-        camera = {"samples": torch.zeros(1, 1, 1, 1, 1), "type": "mono"}
-        return io.NodeOutput(
-            out_image,
-            sky_mask.contiguous(),
-            conf_mask.contiguous(),
-            camera,
-        )
-
-    @classmethod
-    def _execute_multiview(cls, model, image, process_res, resize_method,
-                           normalization, apply_sky_clip,
-                           ref_view_strategy, pose_method) -> io.NodeOutput:
-        assert image.ndim == 4 and image.shape[-1] == 3, \
-            f"expected (B,H,W,3) IMAGE; got {tuple(image.shape)}"
-        S, H, W, _ = image.shape
-
-        mm.load_model_gpu(model)
-        diffusion = model.model.diffusion_model
-        device = mm.get_torch_device()
-        dtype = diffusion.dtype if diffusion.dtype is not None else torch.float32
-
-        # All views in a single forward pass: (1, S, 3, H', W').
-        x = image.to(device)
-        x = da3_preprocess.preprocess_image(x, process_res=process_res, method=resize_method)
-        x = x.to(dtype=dtype).unsqueeze(0)
-
-        use_ray_pose = (pose_method == "ray_pose")
-        with torch.no_grad():
-            out = diffusion(x, use_ray_pose=use_ray_pose,
-                            ref_view_strategy=ref_view_strategy)
-
-        depth = torch.nn.functional.interpolate(
-            out["depth"].float().unsqueeze(1), size=(H, W),
-            mode="bilinear", align_corners=False,
-        ).squeeze(1).cpu()
-
-        conf_raw = torch.zeros_like(depth)
-        if "depth_conf" in out:
-            conf_raw = torch.nn.functional.interpolate(
-                out["depth_conf"].unsqueeze(1).float(), size=(H, W),
-                mode="bilinear", align_corners=False,
-            ).squeeze(1).cpu()
-
-        conf_mask = _normalize_confidence(conf_raw) if conf_raw.any() else conf_raw
-
-        sky = None
-        if "sky" in out:
-            sky = torch.nn.functional.interpolate(
-                out["sky"].unsqueeze(1).float(), size=(H, W),
-                mode="bilinear", align_corners=False,
-            ).squeeze(1).cpu()
-
-        if apply_sky_clip and sky is not None:
-            depth = cls._apply_sky_clip(depth, sky)
-
-        if "extrinsics" in out and "intrinsics" in out:
-            extrinsics = out["extrinsics"].float().cpu()
-            intrinsics = out["intrinsics"].float().cpu()
-        else:
-            extrinsics = torch.eye(4)[None, None].expand(1, S, 4, 4).clone()
-            intrinsics = torch.eye(3)[None, None].expand(1, S, 3, 3).clone()
-
-        sky_for_norm = sky if diffusion.has_sky else None
-        depth_image = cls._depth_to_image(depth, sky_for_norm, normalization)
-
-        sky_mask = sky if sky is not None else torch.zeros_like(depth)
-        camera_latent = {
-            "samples": depth.unsqueeze(0).unsqueeze(2).contiguous(),  # (1, S, 1, H, W)
-            "type": "da3_multiview",
-            "extrinsics": extrinsics.contiguous(),
-            "intrinsics": intrinsics.contiguous(),
-            "depth_raw": depth.contiguous(),
-            "confidence": conf_raw.contiguous(),
-        }
-        return io.NodeOutput(
-            depth_image,
-            sky_mask.contiguous(),
-            conf_mask.contiguous(),
-            camera_latent,
-        )
-
-
-class DepthAnything3Extension(ComfyExtension):
-    @override
-    async def get_node_list(self) -> list[type[io.ComfyNode]]:
-        return [
-            LoadDepthAnything3,
-            DepthAnything3,
-        ]
-
-
-async def comfy_entrypoint() -> DepthAnything3Extension:
-    return DepthAnything3Extension()
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@ -77,7 +77,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
    @classmethod
    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
        latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
-        return io.NodeOutput({"samples": latent})
+        return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 32})

    generate = execute  # TODO: remove

--- a/comfy_extras/nodes_string.py
+++ b/comfy_extras/nodes_string.py
@ -1,10 +1,41 @@
 import re
 import json
+import string
 from typing_extensions import override

 from comfy_api.latest import ComfyExtension, io


+class StringFormat(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        autogrow = io.Autogrow.TemplateNames(
+            input=io.AnyType.Input("value"),
+            names=list(string.ascii_lowercase),
+            min=0,
+        )
+        return io.Schema(
+            node_id="StringFormat",
+            display_name="Format Text",
+            category="text",
+            search_aliases=["string", "format"],
+            description="Same as Python's string format method. Supports all of Python's format options and features.",
+            inputs=[
+                io.Autogrow.Input("values", template=autogrow),
+                io.String.Input("f_string", default="{a}", multiline=True),
+            ],
+            outputs=[
+                io.String.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(
+        cls, values: io.Autogrow.Type, f_string: str
+    ) -> io.NodeOutput:
+        return io.NodeOutput(f_string.format(**values))
+
+
 class StringConcatenate(io.ComfyNode):
    @classmethod
    def define_schema(cls):
@ -413,6 +444,7 @@ class StringExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
+            StringFormat,
            StringConcatenate,
            StringSubstring,
            StringLength,
--- a/folder_paths.py
+++ b/folder_paths.py
@ -60,7 +60,6 @@ folder_names_and_paths["geometry_estimation"] = ([os.path.join(models_dir, "geom

 folder_names_and_paths["optical_flow"] = ([os.path.join(models_dir, "optical_flow")], supported_pt_extensions)

-
 output_directory = os.path.join(base_path, "output")
 temp_directory = os.path.join(base_path, "temp")
 input_directory = os.path.join(base_path, "input")
--- a/nodes.py
+++ b/nodes.py
@ -2444,7 +2444,6 @@ async def init_builtin_extra_nodes():
        "nodes_hidream_o1.py",
        "nodes_save_3d.py",
        "nodes_moge.py",
-        "nodes_depth_anything_3.py",
    ]

    import_failed = []
--- a/openapi.yaml
+++ b/openapi.yaml
@ -4160,6 +4160,10 @@ paths:
                name:
                  type: string
                  description: Display name for the API key
+                description:
+                  type: string
+                  description: User-provided description of the key's purpose
+                  maxLength: 5000
      responses:
        "201":
          description: API key created
@ -6351,14 +6355,6 @@ components:
          type: integer
          format: int64
          description: Size of the asset in bytes
-        width:
-          type: integer
-          nullable: true
-          description: "Original image width in pixels. Null for non-image assets or assets ingested before dimension extraction."
-        height:
-          type: integer
-          nullable: true
-          description: "Original image height in pixels. Null for non-image assets or assets ingested before dimension extraction."
        mime_type:
          type: string
          description: MIME type of the asset
@ -7685,11 +7681,16 @@ components:
      required:
        - id
        - name
+        - description
      properties:
        id:
          type: string
        name:
          type: string
+        description:
+          type: string
+          maxLength: 5000
+          description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
        prefix:
          type: string
          description: First few characters of the key for identification
@ -7710,12 +7711,17 @@ components:
      required:
        - id
        - name
+        - description
        - key
      properties:
        id:
          type: string
        name:
          type: string
+        description:
+          type: string
+          maxLength: 5000
+          description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
        key:
          type: string
          description: Full API key value (only returned on creation)
Author	SHA1	Message	Date
bigcat88	8956e00c16	[Partner Nodes] fix passing images to Grok LLM Signed-off-by: bigcat88 <bigcat88@icloud.com>	2026-05-20 16:33:39 +03:00
bigcat88	3bdf87ffa9	[Partner Nodes] add new OpenRouterLLM node Signed-off-by: bigcat88 <bigcat88@icloud.com>	2026-05-20 15:22:59 +03:00
bigcat88	0a6e3c9d70	[Partner Nodes] add reasoning widget to Anthropic node Signed-off-by: bigcat88 <bigcat88@icloud.com>	2026-05-20 15:22:58 +03:00
Cezarijus Kivylius	78b5dec6b6	fix: Hunyuan3D 2.1 batch size crashes in attention and forward pass (#13699 )	2026-05-20 19:58:49 +08:00
comfyanonymous	72e3f6081c	Add downscale ratio to empty ltxv latent. (#13999 )	2026-05-19 20:28:06 -07:00
Pauan	7ec7b6ffe9	Adding new StringFormat node (#13997 )	2026-05-20 10:25:49 +08:00
Matt Miller	6887165a9d	docs(openapi): tighten workspace API key description field (BE-1004) (#13996 ) Aligns the OSS spec with the cloud-side BE-1004 contract: - createWorkspaceApiKey request body: add maxLength: 5000 to the description property (matches cloud's hub_profile.description MaxLen(5000) convention; enforced cloud-side via handler check). - WorkspaceApiKey + WorkspaceApiKeyCreated response schemas: mark description as required (cloud's handler always populates the field, defaulting to empty string when not supplied on create), drop nullable: true, add maxLength: 5000 for symmetry, and clarify the doc string ("Always present in responses; empty string when no description was supplied on create"). Both schemas are tagged x-runtime: [cloud] at the schema level so the tightening is correctly scoped — OSS-only implementations are not required to honor the workspace API keys endpoints at all. Related cloud PR: Comfy-Org/cloud#3747	2026-05-19 16:55:04 -07:00
Matt Miller	cc4d711eb1	feat(openapi): add optional description field to workspace API key schemas (#13993 ) * feat(openapi): add optional description field to workspace API key schemas Add an optional `description` property (type: string) to three workspace API key schemas in openapi.yaml: - Inline request body of createWorkspaceApiKey (POST /api/workspace/api-keys) - WorkspaceApiKey (list/info schema) - WorkspaceApiKeyCreated (creation response schema) The field is not added to any `required` array, making it fully backward-compatible with existing clients. Refs: BE-1005, BE-1004 Co-authored-by: Matt Miller <mattmillerai@users.noreply.github.com> * fix(openapi): mark description nullable in workspace API key response schemas Per CodeRabbit review on PR #13993: the underlying DB column is nullable varchar (default ''), so the response schemas should permit null to match stored data reality. Without nullable: true the OpenAPI contract would require coercion on the handler side or risk a contract violation. Request schema unchanged — clients shouldn't be sending null on create.	2026-05-19 14:48:47 -07:00
yy	626b082838	Fix typo in ops.py (#11925 )	2026-05-20 05:45:04 +08:00
Matt Miller	d0328b442d	docs(openapi): remove top-level width/height fields on Asset schema (#13973 ) These two fields were added recently to the Asset schema as nullable integers, with the intent of exposing original image dimensions for FE consumers (cloud-side thumbnailing makes naturalWidth/Height return the wrong size for an image card's dimension label). The implementation effort that consumes them subsequently converged on a different shape — dimensions nested under the existing free-form `metadata` JSON field as `{kind: "image", width, height}` — to avoid introducing type-specific flat fields on the canonical Asset shape, and to leave room for forward-compatible additions (video duration, fps, etc.) without further schema churn. This removes the now-unused top-level fields so the spec reflects the agreed direction. No other schema definitions reference these fields directly: AssetCreated, AssetUpdated, etc. inherit Asset via allOf and do not redefine them. The runtime ingest implementation that would have populated these fields was not yet shipped, so no clients are relying on the top-level shape. Co-authored-by: Alexis Rolland <alexisrolland@hotmail.com>	2026-05-19 10:00:26 -07:00