mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-18 12:37:58 +08:00
Compare commits
4 Commits
ci/cursor-
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| f2270f070a | |||
| 191a75a2cd | |||
| 52257bb435 | |||
| e25c391888 |
37
.github/workflows/ci-cursor-review.yml
vendored
37
.github/workflows/ci-cursor-review.yml
vendored
@ -1,37 +0,0 @@
|
||||
name: CI - Cursor Review

# Thin caller for the shared reusable cursor-review workflow in
# Comfy-Org/github-workflows. The review logic (panel matrix, judge
# consolidation, prompts, extract/post/notify scripts) lives there as the
# single source of truth, so this repo only carries the repo-specific diff
# excludes.

on:
  pull_request:
    types: [labeled, unlabeled]

concurrency:
  group: cursor-review-pr-${{ github.event.pull_request.number }}-${{ github.event.label.name }}
  cancel-in-progress: true

jobs:
  cursor-review:
    permissions:
      contents: read
      pull-requests: write
    # SHA-pinned per zizmor `unpinned-uses: hash-pin`. Bump this SHA to pick up
    # upstream changes; keep `workflows_ref` matching so prompts/scripts load
    # from the same commit as the workflow definition.
    uses: Comfy-Org/github-workflows/.github/workflows/cursor-review.yml@047ca48febe3a6647608ed2e0c4331b491cb9d6a # github-workflows#9
    with:
      workflows_ref: 047ca48febe3a6647608ed2e0c4331b491cb9d6a
      diff_excludes: >-
        :!**/.claude/**
        :!**/dist/**
        :!**/vendor/**
        :!**/*.generated.*
        :!**/*.min.js
        :!**/*.min.css
    secrets:
      CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
      SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
321
comfy/ldm/boogu/model.py
Normal file
321
comfy/ldm/boogu/model.py
Normal file
@ -0,0 +1,321 @@
|
||||
# Boogu-Image-0.1 transformer
# Architecture is an OmniGen2 derivative (see comfy/ldm/omnigen/omnigen2.py) with an
# added dual-stream ("double_stream") stage before the single-stream layers, conditioned
# by a Qwen3-VL multimodal LLM. It reuses the OmniGen2/Lumina building blocks and the Flux
# RoPE core; the only new components are the double-stream block and the hybrid forward order.
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from einops import rearrange
|
||||
|
||||
import comfy.ldm.common_dit
|
||||
import comfy.ldm.omnigen.omnigen2
|
||||
from comfy.ldm.modules.attention import optimized_attention_masked
|
||||
from comfy.ldm.omnigen.omnigen2 import (
|
||||
OmniGen2RotaryPosEmbed,
|
||||
Lumina2CombinedTimestepCaptionEmbedding,
|
||||
LuminaRMSNormZero,
|
||||
LuminaLayerNormContinuous,
|
||||
LuminaFeedForward,
|
||||
Attention,
|
||||
OmniGen2TransformerBlock,
|
||||
apply_rotary_emb,
|
||||
)
|
||||
|
||||
class BooguDoubleStreamProcessor(nn.Module):
    """Joint attention over ``[instruct ; img]`` with per-stream projections.

    Each stream (instruction and image) owns its q/k/v input projections and
    its own output projection. The q/k RMSNorms, head counts and the final
    shared output projection are read from the ``attn`` module passed to
    ``forward``.
    """

    def __init__(self, dim, head_dim, heads, kv_heads, dtype=None, device=None, operations=None):
        """Create the per-stream projection layers.

        Args:
            dim: Accepted for signature parity with the owning attention
                module; not used here (projections are sized from
                ``head_dim * heads``).
            head_dim: Size of one attention head.
            heads: Number of query heads.
            kv_heads: Number of key/value heads; ``heads`` must be a
                positive multiple of it.
            dtype, device: Forwarded to every ``operations.Linear``.
            operations: Namespace providing the ``Linear`` layer class.

        Raises:
            ValueError: If ``heads`` is not a positive multiple of ``kv_heads``.
        """
        super().__init__()
        # forward() expands k/v with repeat_interleave(heads // kv_heads); a
        # non-divisible pair would yield the wrong head count, so fail early.
        if kv_heads <= 0 or heads % kv_heads != 0:
            raise ValueError(f"heads ({heads}) must be a positive multiple of kv_heads ({kv_heads})")

        query_dim = head_dim * heads
        kv_dim = head_dim * kv_heads

        self.img_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
        self.img_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
        self.img_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)

        self.instruct_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
        self.instruct_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
        self.instruct_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)

        self.instruct_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
        self.img_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)

    def forward(self, attn, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
        """Run joint attention and return the concatenated ``[instruct ; img]`` result.

        Args:
            attn: Owning attention module; supplies ``heads``, ``kv_heads``,
                ``dim_head``, ``norm_q``, ``norm_k`` and ``to_out``.
            img_hidden_states: Image-stream tokens, sequence on dim 1.
            instruct_hidden_states: Instruction-stream tokens, sequence on dim 1.
            rotary_emb: Rotary embedding for the joint sequence, or ``None``
                to skip rotary application.
            attention_mask: Optional mask forwarded to the attention kernel.
            transformer_options: Forwarded to the attention kernel unchanged.

        Returns:
            Tensor with instruction tokens first and image tokens after, on dim 1.
        """
        batch_size = img_hidden_states.shape[0]
        L_instruct = instruct_hidden_states.shape[1]

        img_q = self.img_to_q(img_hidden_states)
        img_k = self.img_to_k(img_hidden_states)
        img_v = self.img_to_v(img_hidden_states)

        instruct_q = self.instruct_to_q(instruct_hidden_states)
        instruct_k = self.instruct_to_k(instruct_hidden_states)
        instruct_v = self.instruct_to_v(instruct_hidden_states)

        # Concatenate instruction first, then image (matches reference processor order).
        query = torch.cat([instruct_q, img_q], dim=1)
        key = torch.cat([instruct_k, img_k], dim=1)
        value = torch.cat([instruct_v, img_v], dim=1)

        # Split the channel dim into (heads, dim_head) so the per-head norms apply.
        query = query.view(batch_size, -1, attn.heads, attn.dim_head)
        key = key.view(batch_size, -1, attn.kv_heads, attn.dim_head)
        value = value.view(batch_size, -1, attn.kv_heads, attn.dim_head)

        query = attn.norm_q(query)
        key = attn.norm_k(key)

        if rotary_emb is not None:
            query = apply_rotary_emb(query, rotary_emb)
            key = apply_rotary_emb(key, rotary_emb)

        # Move heads to dim 1 for the attention call.
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        # Grouped-query attention: repeat each k/v head to match the query head count.
        if attn.kv_heads < attn.heads:
            key = key.repeat_interleave(attn.heads // attn.kv_heads, dim=1)
            value = value.repeat_interleave(attn.heads // attn.kv_heads, dim=1)

        hidden_states = optimized_attention_masked(query, key, value, attn.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options)

        # Split back to instruction/image, apply per-stream output projections, recombine.
        instruct_hidden_states = self.instruct_out(hidden_states[:, :L_instruct])
        img_hidden_states = self.img_out(hidden_states[:, L_instruct:])
        hidden_states = torch.cat([instruct_hidden_states, img_hidden_states], dim=1)

        # Shared output projection (first entry of attn.to_out).
        hidden_states = attn.to_out[0](hidden_states)
        return hidden_states
|
||||
|
||||
|
||||
class BooguJointAttention(nn.Module):
    """Holds the shared q/k RMSNorm and final output projection for joint attention.

    The per-stream projections and the attention computation itself live in
    ``self.processor`` (a ``BooguDoubleStreamProcessor``), which reads the
    head configuration and shared layers from this module.
    """

    def __init__(self, dim, head_dim, heads, kv_heads, eps=1e-5, dtype=None, device=None, operations=None):
        """Build the shared norms, output projection and the processor.

        Args:
            dim: Output width of the shared output projection.
            head_dim: Size of one attention head.
            heads: Number of query heads.
            kv_heads: Number of key/value heads.
            eps: Epsilon for the q/k RMSNorm layers.
            dtype, device: Forwarded to every created layer.
            operations: Namespace providing ``Linear`` and ``RMSNorm``.
        """
        super().__init__()
        self.heads = heads
        self.kv_heads = kv_heads
        self.dim_head = head_dim
        self.scale = head_dim ** -0.5

        self.norm_q = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device)
        self.norm_k = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device)
        self.to_out = nn.Sequential(
            operations.Linear(heads * head_dim, dim, bias=False, dtype=dtype, device=device),
            nn.Dropout(0.0),
        )
        self.processor = BooguDoubleStreamProcessor(dim, head_dim, heads, kv_heads, dtype=dtype, device=device, operations=operations)

    def forward(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
        """Delegate to the processor, passing this module as its ``attn`` argument."""
        return self.processor(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask, transformer_options=transformer_options)
|
||||
|
||||
|
||||
class BooguDoubleStreamBlock(nn.Module):
    """Dual-stream block.

    Combines joint attention over ``[instruct ; img]`` with an additional
    image-only self-attention. Each stream has its own modulation layers,
    normalisation layers and feed-forward network.
    """

    def __init__(self, dim, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=None, device=None, operations=None):
        """Create the attention, feed-forward, modulation and norm layers.

        Args:
            dim: Hidden width of both streams.
            num_attention_heads: Number of query heads; head size is
                ``dim // num_attention_heads``.
            num_kv_heads: Number of key/value heads.
            multiple_of: Forwarded to ``LuminaFeedForward``.
            ffn_dim_multiplier: Accepted for signature parity with the other
                transformer blocks; not used by this block.
            norm_eps: Epsilon for the modulation and RMSNorm layers.
            dtype, device, operations: Forwarded to every created layer.
        """
        super().__init__()
        head_dim = dim // num_attention_heads

        self.img_instruct_attn = BooguJointAttention(dim, head_dim, num_attention_heads, num_kv_heads, eps=1e-5, dtype=dtype, device=device, operations=operations)
        self.img_self_attn = Attention(
            query_dim=dim, dim_head=head_dim, heads=num_attention_heads, kv_heads=num_kv_heads,
            eps=1e-5, bias=False, dtype=dtype, device=device, operations=operations,
        )

        self.img_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations)
        self.instruct_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations)

        # Timestep-conditioned modulation layers (each returns a normed tensor plus three modulation terms).
        self.img_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
        self.img_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
        self.img_norm3 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
        self.instruct_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
        self.instruct_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)

        # Plain RMSNorms applied to sub-layer outputs / feed-forward inputs of the image stream.
        self.img_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
        self.img_self_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
        self.img_ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
        self.img_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)

        # Same for the instruction stream (which has no self-attention branch).
        self.instruct_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
        self.instruct_ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
        self.instruct_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)

    def forward(self, img_hidden_states, instruct_hidden_states, joint_rotary_emb, img_rotary_emb, temb, joint_attention_mask=None, img_attention_mask=None, transformer_options={}):
        """Apply one dual-stream step.

        Args:
            img_hidden_states: Image-stream tokens, sequence on dim 1.
            instruct_hidden_states: Instruction-stream tokens, sequence on dim 1.
            joint_rotary_emb: Rotary embedding for the joint ``[instruct ; img]`` sequence.
            img_rotary_emb: Rotary embedding for the image-only self-attention.
            temb: Conditioning embedding fed to every modulation layer.
            joint_attention_mask: Optional mask for the joint attention.
            img_attention_mask: Optional mask for the image self-attention.
            transformer_options: Forwarded to both attention modules.

        Returns:
            Tuple ``(img_hidden_states, instruct_hidden_states)`` after the block.
        """
        L_instruct = instruct_hidden_states.shape[1]

        # All modulation layers read the *input* hidden states, before any residual update.
        img_norm1_out, img_gate_msa, img_scale_mlp, img_gate_mlp = self.img_norm1(img_hidden_states, temb)
        img_norm2_out, img_shift_mlp, _, _ = self.img_norm2(img_hidden_states, temb)
        img_norm3_out, img_gate_self, _, _ = self.img_norm3(img_hidden_states, temb)

        instruct_norm1_out, instruct_gate_msa, instruct_scale_mlp, instruct_gate_mlp = self.instruct_norm1(instruct_hidden_states, temb)
        instruct_norm2_out, instruct_shift_mlp, _, _ = self.instruct_norm2(instruct_hidden_states, temb)

        # Joint attention returns [instruct ; img]; split it back by instruction length.
        joint_attn_out = self.img_instruct_attn(img_norm1_out, instruct_norm1_out, joint_rotary_emb, joint_attention_mask, transformer_options=transformer_options)
        instruct_attn_out = joint_attn_out[:, :L_instruct]
        img_attn_out = joint_attn_out[:, L_instruct:]

        img_self_attn_out = self.img_self_attn(img_norm3_out, img_norm3_out, img_attention_mask, img_rotary_emb, transformer_options=transformer_options)

        # Image stream: gated residuals for joint attention, self-attention, then the MLP.
        img_hidden_states = img_hidden_states + img_gate_msa.unsqueeze(1).tanh() * self.img_attn_norm(img_attn_out)
        img_hidden_states = img_hidden_states + img_gate_self.unsqueeze(1).tanh() * self.img_self_attn_norm(img_self_attn_out)
        img_mlp_input = (1 + img_scale_mlp.unsqueeze(1)) * img_norm2_out + img_shift_mlp.unsqueeze(1)
        img_mlp_out = self.img_feed_forward(self.img_ffn_norm1(img_mlp_input))
        img_hidden_states = img_hidden_states + img_gate_mlp.unsqueeze(1).tanh() * self.img_ffn_norm2(img_mlp_out)

        # Instruction stream: gated residuals for joint attention, then the MLP.
        instruct_hidden_states = instruct_hidden_states + instruct_gate_msa.unsqueeze(1).tanh() * self.instruct_attn_norm(instruct_attn_out)
        instruct_mlp_input = (1 + instruct_scale_mlp.unsqueeze(1)) * instruct_norm2_out + instruct_shift_mlp.unsqueeze(1)
        instruct_mlp_out = self.instruct_feed_forward(self.instruct_ffn_norm1(instruct_mlp_input))
        instruct_hidden_states = instruct_hidden_states + instruct_gate_mlp.unsqueeze(1).tanh() * self.instruct_ffn_norm2(instruct_mlp_out)

        return img_hidden_states, instruct_hidden_states
|
||||
|
||||
|
||||
class BooguTransformer2DModel(nn.Module):
    """Boogu-Image transformer.

    Forward order: context refiner -> image patch embed + refiners ->
    double-stream layers (separate instruction/image streams) ->
    single-stream layers over the concatenated sequence -> output norm and
    un-patchify.
    """

    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 16,
        out_channels: Optional[int] = None,
        hidden_size: int = 3360,
        num_layers: int = 32,
        num_double_stream_layers: int = 8,
        num_refiner_layers: int = 2,
        num_attention_heads: int = 28,
        num_kv_heads: int = 7,
        multiple_of: int = 256,
        ffn_dim_multiplier: Optional[float] = None,
        norm_eps: float = 1e-5,
        axes_dim_rope: Tuple[int, int, int] = (40, 40, 40),
        axes_lens: Tuple[int, int, int] = (2048, 1664, 1664),
        instruction_feat_dim: int = 4096,
        timestep_scale: float = 1000.0,
        image_model=None,
        device=None, dtype=None, operations=None,
    ):
        """Build all sub-modules.

        Args:
            patch_size: Side length of a square latent patch.
            in_channels: Latent channels of the input.
            out_channels: Latent channels of the output; falls back to
                ``in_channels`` when falsy.
            hidden_size: Transformer width.
            num_layers: Number of single-stream layers.
            num_double_stream_layers: Number of double-stream layers.
            num_refiner_layers: Depth of each refiner stack (noise,
                reference image, context).
            num_attention_heads, num_kv_heads: Attention head counts.
            multiple_of, ffn_dim_multiplier, norm_eps: Forwarded to the blocks.
            axes_dim_rope, axes_lens: Rotary embedding configuration.
            instruction_feat_dim: Feature width of the incoming text context.
            timestep_scale: Forwarded to the timestep/caption embedder.
            image_model: Accepted but not used in this class.
            device, dtype, operations: Forwarded to every created layer.
        """
        super().__init__()

        self.patch_size = patch_size
        self.out_channels = out_channels or in_channels
        self.hidden_size = hidden_size
        self.dtype = dtype

        self.rope_embedder = OmniGen2RotaryPosEmbed(
            theta=10000,
            axes_dim=axes_dim_rope,
            axes_lens=axes_lens,
            patch_size=patch_size,
        )

        self.x_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
        self.ref_image_patch_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)

        self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
            hidden_size=hidden_size,
            text_feat_dim=instruction_feat_dim,
            norm_eps=norm_eps,
            timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
        )

        self.noise_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
            for _ in range(num_refiner_layers)
        ])

        self.ref_image_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
            for _ in range(num_refiner_layers)
        ])

        # The context refiner is the only stack built without timestep modulation.
        self.context_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=False, dtype=dtype, device=device, operations=operations)
            for _ in range(num_refiner_layers)
        ])

        self.double_stream_layers = nn.ModuleList([
            BooguDoubleStreamBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=dtype, device=device, operations=operations)
            for _ in range(num_double_stream_layers)
        ])

        self.single_stream_layers = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
            for _ in range(num_layers)
        ])

        self.norm_out = LuminaLayerNormContinuous(
            embedding_dim=hidden_size,
            conditioning_embedding_dim=min(hidden_size, 1024),
            elementwise_affine=False,
            eps=1e-6,
            out_dim=patch_size * patch_size * self.out_channels, dtype=dtype, device=device, operations=operations
        )

        # Allocated uninitialised; values are expected to come from the loaded checkpoint.
        self.image_index_embedding = nn.Parameter(torch.empty(5, hidden_size, device=device, dtype=dtype))

    # Patchify/refine helpers are identical to OmniGen2; reuse them as methods of this class.
    flat_and_pad_to_seq = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.flat_and_pad_to_seq
    img_patch_embed_and_refine = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.img_patch_embed_and_refine

    def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs):
        """Run the model on a batch of latents.

        Args:
            x: Input latents; unpacked as a 4-D tensor whose last two dims
                are height and width.
            timesteps: Sampler timesteps; the model is fed ``1.0 - timesteps``.
            context: Text/instruction features.
            num_tokens: Per-sample token count passed to the rope embedder
                (replicated for every batch entry).
            ref_latents: Optional reference-image latents.
            attention_mask: Optional mask used by the context refiner only.
            transformer_options: Forwarded to every layer.
            **kwargs: Ignored.

        Returns:
            Negated prediction, cropped back to the height and width of ``x``.
        """
        _, _, H, W = x.shape
        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
        _, _, H_padded, W_padded = hidden_states.shape
        timestep = 1.0 - timesteps
        text_hidden_states = context
        text_attention_mask = attention_mask
        ref_image_hidden_states = ref_latents
        device = hidden_states.device

        temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)

        (
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes,
        ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)

        (
            context_rotary_emb, ref_img_rotary_emb, noise_rotary_emb,
            rotary_emb, encoder_seq_lengths, seq_lengths,
        ) = self.rope_embedder(
            hidden_states.shape[0], text_hidden_states.shape[1], [num_tokens] * text_hidden_states.shape[0],
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes, device,
        )

        for layer in self.context_refiner:
            text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb, transformer_options=transformer_options)

        # Token count of the noise image; used at the end to slice the output tokens.
        img_len = hidden_states.shape[1]
        combined_img_hidden_states = self.img_patch_embed_and_refine(
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            noise_rotary_emb, ref_img_rotary_emb,
            l_effective_ref_img_len, l_effective_img_len,
            temb,
            transformer_options=transformer_options,
        )

        # Double-stream stage: the image self-attention only sees the [ref ; noise] tokens,
        # which sit after the instruction tokens in the joint rope.
        L_instruct = text_hidden_states.shape[1]
        combined_img_rotary_emb = rotary_emb[:, L_instruct:]
        for layer in self.double_stream_layers:
            combined_img_hidden_states, text_hidden_states = layer(
                combined_img_hidden_states, text_hidden_states,
                rotary_emb, combined_img_rotary_emb, temb,
                joint_attention_mask=None, img_attention_mask=None,
                transformer_options=transformer_options,
            )

        # Single-stream stage runs on the joint [text ; image] sequence.
        hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1)

        for layer in self.single_stream_layers:
            hidden_states = layer(hidden_states, None, rotary_emb, temb, transformer_options=transformer_options)

        hidden_states = self.norm_out(hidden_states, temb)

        # Keep the trailing img_len tokens, un-patchify, and crop away the patch padding.
        p = self.patch_size
        output = rearrange(hidden_states[:, -img_len:], 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)', h=H_padded // p, w=W_padded // p, p1=p, p2=p)[:, :, :H, :W]

        return -output
|
||||
@ -22,7 +22,7 @@ def apply_rotary_emb(x, freqs_cis):
|
||||
|
||||
|
||||
def swiglu(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Return ``silu(x) * y``, computed in place on ``x``.

    NOTE: ``x`` is overwritten and the returned tensor shares its storage.
    Only pass a tensor the caller no longer needs (for example a fresh
    intermediate result).
    """
    return F.silu(x, inplace=True).mul_(y)
|
||||
|
||||
|
||||
class TimestepEmbedding(nn.Module):
|
||||
|
||||
@ -54,6 +54,7 @@ import comfy.ldm.pixeldit.model
|
||||
import comfy.ldm.pixeldit.pid
|
||||
import comfy.ldm.ace.model
|
||||
import comfy.ldm.omnigen.omnigen2
|
||||
import comfy.ldm.boogu.model
|
||||
import comfy.ldm.qwen_image.model
|
||||
import comfy.ldm.ideogram4.model
|
||||
import comfy.ldm.kandinsky5.model
|
||||
@ -2103,6 +2104,11 @@ class Omnigen2(BaseModel):
|
||||
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
|
||||
return out
|
||||
|
||||
class Boogu(Omnigen2):
    # Boogu-Image model wrapper: inherits Omnigen2's methods but builds a BooguTransformer2DModel.
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        # super(Omnigen2, self) deliberately skips Omnigen2.__init__ and calls the next
        # class in the MRO, so the Boogu transformer can be passed as unet_model.
        super(Omnigen2, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.boogu.model.BooguTransformer2DModel)
        self.memory_usage_factor_conds = ("ref_latents",)
|
||||
|
||||
class QwenImage(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel)
|
||||
|
||||
@ -761,6 +761,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight'.format(key_prefix) in state_dict_keys: # Boogu-Image (OmniGen2 derivative + dual-stream stage)
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "boogu"
|
||||
dit_config["hidden_size"] = state_dict['{}x_embedder.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}single_stream_layers.'.format(key_prefix) + '{}.')
|
||||
dit_config["num_double_stream_layers"] = count_blocks(state_dict_keys, '{}double_stream_layers.'.format(key_prefix) + '{}.')
|
||||
dit_config["num_refiner_layers"] = count_blocks(state_dict_keys, '{}noise_refiner.'.format(key_prefix) + '{}.')
|
||||
dit_config["instruction_feat_dim"] = state_dict['{}time_caption_embed.caption_embedder.0.weight'.format(key_prefix)].shape[0]
|
||||
return dit_config
|
||||
|
||||
if '{}time_caption_embed.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: # Omnigen2
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "omnigen2"
|
||||
|
||||
@ -68,6 +68,7 @@ import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.qwen35
|
||||
import comfy.text_encoders.qwen3vl
|
||||
import comfy.text_encoders.boogu
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.gemma4
|
||||
import comfy.text_encoders.cogvideo
|
||||
@ -1301,6 +1302,7 @@ class CLIPType(Enum):
|
||||
LENS = 28
|
||||
PIXELDIT = 29
|
||||
IDEOGRAM4 = 30
|
||||
BOOGU = 31
|
||||
|
||||
|
||||
|
||||
@ -1622,6 +1624,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
|
||||
elif clip_type == CLIPType.BOOGU and te_model == TEModel.QWEN3VL_8B: # Boogu-Image: full Qwen3-VL-8B, last hidden state, no-think template.
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
clip_target.clip = comfy.text_encoders.boogu.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.boogu.BooguTokenizer
|
||||
elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2): # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused.
|
||||
klein_model_type = "qwen3_8b" if te_model == TEModel.QWEN3VL_8B else "qwen3_4b"
|
||||
clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type=klein_model_type)
|
||||
|
||||
@ -25,6 +25,7 @@ import comfy.text_encoders.hunyuan_image
|
||||
import comfy.text_encoders.kandinsky5
|
||||
import comfy.text_encoders.z_image
|
||||
import comfy.text_encoders.ideogram4
|
||||
import comfy.text_encoders.boogu
|
||||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
@ -1758,6 +1759,27 @@ class Omnigen2(supported_models_base.BASE):
|
||||
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))
|
||||
|
||||
class Boogu(Omnigen2):
    # Supported-model entry for Boogu-Image; matched via image_model == "boogu".
    unet_config = {
        "image_model": "boogu",
    }

    sampling_settings = {
        "multiplier": 1.0,
        "shift": 3.16,
    }

    memory_usage_factor = 2.15

    def get_model(self, state_dict, prefix="", device=None):
        # state_dict and prefix are accepted for interface compatibility; unused here.
        return model_base.Boogu(self, device=device)

    def clip_target(self, state_dict={}):
        # Detect the text-encoder settings from the weights under the qwen3vl_8b prefix.
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3vl_8b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.boogu.BooguTokenizer, comfy.text_encoders.boogu.te(**hunyuan_detect))
|
||||
|
||||
class Ideogram4(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "ideogram4",
|
||||
@ -2300,6 +2322,7 @@ models = [
|
||||
ACEStep,
|
||||
ACEStep15,
|
||||
Omnigen2,
|
||||
Boogu,
|
||||
QwenImage,
|
||||
Ideogram4,
|
||||
Flux2,
|
||||
|
||||
58
comfy/text_encoders/boogu.py
Normal file
58
comfy/text_encoders/boogu.py
Normal file
@ -0,0 +1,58 @@
|
||||
"""Boogu-Image text encoder: full Qwen3-VL-8B, last hidden state (4096-dim).
|
||||
|
||||
Boogu uses the final hidden state of Qwen3-VL as the per-token instruction feature
|
||||
(num_instruction_feature_layers=1, reduce_type=mean -> just the last layer).
|
||||
The model itself is the standard Qwen3-VL TE, only the chat template differs
|
||||
(a fixed system prompt and no <think> block).
|
||||
"""
|
||||
|
||||
import comfy.text_encoders.qwen3vl
|
||||
from comfy import sd1_clip
|
||||
|
||||
|
||||
# System prompts from the reference pipeline (pipeline_boogu.py).
# T2I (non-empty instruction, no image) uses the helpful-assistant prompt;
# everything else (the CFG negative / "drop" condition, and any image case) uses the TI2I "describe" prompt.
|
||||
BOOGU_T2I_SYSTEM = "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows."
|
||||
BOOGU_DROP_SYSTEM = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
|
||||
|
||||
|
||||
class BooguTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
    """Qwen3-VL tokenizer with Boogu's fixed system prompts.

    Three chat templates are set up: text-only, text plus image, and a
    "drop" template used when the instruction is empty and no image is given.
    """

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, model_type="qwen3vl_8b")
        # apply_chat_template without add_generation_prompt
        self.llama_template = "<|im_start|>system\n" + BOOGU_T2I_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"
        self.llama_template_images = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n"
        # Reference SYSTEM_PROMPT_DROP: used for the empty negative/uncond instruction.
        self.llama_template_drop = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
        """Tokenize ``text``, choosing the drop template for an empty, image-free prompt.

        An explicitly passed ``llama_template`` is always respected. All other
        arguments are forwarded to the parent tokenizer unchanged.
        """
        if llama_template is None and len(images) == 0 and text.strip() == "":
            llama_template = self.llama_template_drop
        # Boogu conditions on the no-think template; thinking=True drops the empty <think> block qwen3vl adds by default.
        return super().tokenize_with_weights(text, return_word_ids=return_word_ids, llama_template=llama_template, images=images, prevent_empty_text=prevent_empty_text, thinking=thinking, **kwargs)
|
||||
|
||||
|
||||
class BooguQwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
    """Qwen3-VL clip model variant with ``layer_norm_hidden_state`` enabled."""

    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
        super().__init__(device=device, dtype=dtype, attention_mask=attention_mask, model_options=model_options, model_type=model_type)
        # apply the final RMSNorm to the tapped last layer
        self.layer_norm_hidden_state = True
|
||||
|
||||
|
||||
class BooguTEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper that hosts the Boogu Qwen3-VL encoder under the name ``qwen3vl_8b``."""

    def __init__(self, device="cpu", dtype=None, model_options={}):
        # Factory handed to SD1ClipModel; pins model_type regardless of the other kwargs.
        def clip_model(**kw):
            return BooguQwen3VLClipModel(**kw, model_type="qwen3vl_8b")

        super().__init__(device=device, dtype=dtype, name="qwen3vl_8b", clip_model=clip_model, model_options=model_options)
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
    """Return a ``BooguTEModel`` subclass with the detected settings baked in.

    Args:
        dtype_llama: When not ``None``, overrides the ``dtype`` passed to the
            model constructor.
        llama_quantization_metadata: When not ``None``, stored under
            ``model_options["quantization_metadata"]`` on a copy of the
            caller's ``model_options``.

    Returns:
        The generated class (not an instance).
    """
    class BooguTEModel_(BooguTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if dtype_llama is not None:
                dtype = dtype_llama
            if llama_quantization_metadata is not None:
                # Copy first so the caller's dict (or the shared default) is never mutated.
                model_options = model_options.copy()
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=dtype, model_options=model_options)

    return BooguTEModel_
|
||||
@ -25,6 +25,11 @@ CLI_FEATURE_FLAG_REGISTRY: dict[str, FeatureFlagInfo] = {
|
||||
"default": False,
|
||||
"description": "Show the sign-in button in the frontend even when not signed in",
|
||||
},
|
||||
"enable_telemetry": {
|
||||
"type": "bool",
|
||||
"default": False,
|
||||
"description": "Signal the frontend that telemetry collection is enabled",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -149,3 +149,59 @@ class MotionControlRequest(BaseModel):
|
||||
character_orientation: str = Field(...)
|
||||
mode: str = Field(..., description="'pro' or 'std'")
|
||||
model_name: str = Field(...)
|
||||
|
||||
|
||||
class Kling3TurboSettings(BaseModel):
    # Generation settings shared by the Turbo text-to-video and image-to-video requests.
    resolution: str = Field("720p", description="'720p' or '1080p'")
    aspect_ratio: str | None = Field(None, description="'16:9'/'9:16'/'1:1'; text-to-video only")
    duration: int = Field(5, description="3-15 second")
|
||||
|
||||
|
||||
class Kling3TurboText2VideoRequest(BaseModel):
    # Request body for Turbo text-to-video: a prompt plus optional settings.
    prompt: str = Field(..., description="<=3072 chars; may use multi-shot 'shot n, m, words; ...'")
    settings: Kling3TurboSettings | None = Field(None)
|
||||
|
||||
|
||||
class Kling3TurboContent(BaseModel):
    # One material entry of an image-to-video request; `type` selects which of text/url applies.
    type: str = Field(..., description="'prompt' or 'first_frame'")
    text: str | None = Field(None, description="for type=prompt; <=2500 chars")
    url: str | None = Field(None, description="for type=first_frame")
|
||||
|
||||
|
||||
class Kling3TurboImage2VideoRequest(BaseModel):
    # Request body for Turbo image-to-video: a list of materials plus optional settings.
    contents: list[Kling3TurboContent] = Field(..., description="prompt + first_frame materials")
    settings: Kling3TurboSettings | None = Field(None)
|
||||
|
||||
|
||||
class Kling3TurboCreateData(BaseModel):
    # `data` payload of the task-creation response; every field is optional.
    id: str | None = Field(None, description="Task ID")
    status: str | None = Field(None)
    message: str | None = Field(None)
|
||||
|
||||
|
||||
class Kling3TurboCreateResponse(BaseModel):
    # Envelope returned when a Turbo task is created.
    code: int | None = Field(None)
    message: str | None = Field(None)
    request_id: str | None = Field(None)
    data: Kling3TurboCreateData | None = Field(None)
|
||||
|
||||
|
||||
class Kling3TurboOutput(BaseModel):
    # One produced artifact of a finished task.
    type: str | None = Field(None, description="'video', 'image', 'audio', ...")
    id: str | None = Field(None)
    url: str | None = Field(None)
    duration: str | None = Field(None)
|
||||
|
||||
|
||||
class Kling3TurboTaskData(BaseModel):
    # State of a single task as reported by the query endpoint.
    id: str | None = Field(None)
    status: str | None = Field(None, description="submitted | processing | succeeded | failed")
    message: str | None = Field(None)
    outputs: list[Kling3TurboOutput] | None = Field(None)
|
||||
|
||||
|
||||
# Envelope returned by the Kling 3.0 Turbo task query; every field is optional.
class Kling3TurboQueryResponse(BaseModel):
    code: int | None = None
    message: str | None = None
    request_id: str | None = None
    # One entry per queried task.
    data: list[Kling3TurboTaskData] | None = None
|
||||
|
||||
@ -60,6 +60,12 @@ from comfy_api_nodes.apis.kling import (
|
||||
OmniProImageRequest,
|
||||
OmniProReferences2VideoRequest,
|
||||
OmniProText2VideoRequest,
|
||||
Kling3TurboSettings,
|
||||
Kling3TurboText2VideoRequest,
|
||||
Kling3TurboContent,
|
||||
Kling3TurboImage2VideoRequest,
|
||||
Kling3TurboCreateResponse,
|
||||
Kling3TurboQueryResponse,
|
||||
TaskStatusResponse,
|
||||
TextToVideoWithAudioRequest,
|
||||
)
|
||||
@ -2847,6 +2853,67 @@ class MotionControl(IO.ComfyNode):
|
||||
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
|
||||
|
||||
|
||||
def build_turbo_shot_prompt(multi_prompt: list[MultiPromptEntry]) -> str:
    """Serialize storyboard entries into the Turbo multi-shot prompt string.

    Every entry is rendered as ``shot <n>, <duration>, <prompt>`` where ``n`` is the
    1-based position and the duration is coerced with ``int()``. Shots are joined
    with ``"; "`` and a closing ``";"`` is always appended, so an empty list yields
    just ``";"``.
    """
    shots = [
        f"shot {position}, {int(entry.duration)}, {entry.prompt}"
        for position, entry in enumerate(multi_prompt, start=1)
    ]
    joined = "; ".join(shots)
    return joined + ";"
|
||||
|
||||
|
||||
def _turbo_video_url(response: Kling3TurboQueryResponse) -> str:
    """Return the URL of the first video output of the first task in a /tasks response.

    Only ``response.data[0]`` is inspected. An output qualifies when its ``type`` is
    ``"video"`` and its ``url`` is non-empty.

    Raises:
        RuntimeError: if there is no task, the task has no outputs, or none of the
            outputs qualifies; the message includes the dumped response.
    """
    tasks = response.data
    first_task = tasks[0] if tasks else None
    candidates = (first_task.outputs if first_task else None) or []
    video_url = next(
        (item.url for item in candidates if item.type == "video" and item.url),
        None,
    )
    if video_url is None:
        raise RuntimeError(f"Kling 3.0 Turbo task finished without a video output: {response.model_dump()}")
    return video_url
|
||||
|
||||
|
||||
async def execute_kling_turbo(
    cls: type[IO.ComfyNode],
    *,
    prompt: str,
    resolution: str,
    aspect_ratio: str,
    duration: int,
    start_frame: torch.Tensor | None,
) -> IO.NodeOutput:
    """Submit a Kling 3.0 Turbo generation, wait for it, and return the downloaded video.

    With ``start_frame`` the image-to-video endpoint is used: the frame is validated
    (minimum 300x300, aspect ratio between 1:2.5 and 2.5:1), sent as a ``first_frame``
    content entry, and ``aspect_ratio`` is not forwarded. Without it the text-to-video
    endpoint is used and ``aspect_ratio`` is included in the settings.

    Raises:
        RuntimeError: if the create response carries no task id, or the finished task
            exposes no video output.
    """
    if start_frame is None:
        endpoint_path = "/proxy/kling/text-to-video/kling-3.0-turbo"
        payload = Kling3TurboText2VideoRequest(
            prompt=prompt,
            settings=Kling3TurboSettings(resolution=resolution, aspect_ratio=aspect_ratio, duration=duration),
        )
    else:
        validate_image_dimensions(start_frame, min_width=300, min_height=300)
        validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
        frame_entry = Kling3TurboContent(type="first_frame", url=tensor_to_base64_string(start_frame))
        # The prompt entry is optional and, when present, goes ahead of the frame.
        prompt_entries = [Kling3TurboContent(type="prompt", text=prompt)] if prompt else []
        endpoint_path = "/proxy/kling/image-to-video/kling-3.0-turbo"
        payload = Kling3TurboImage2VideoRequest(
            contents=[*prompt_entries, frame_entry],
            settings=Kling3TurboSettings(resolution=resolution, duration=duration),  # i2v: no aspect_ratio
        )

    create = await sync_op(
        cls,
        ApiEndpoint(path=endpoint_path, method="POST"),
        response_model=Kling3TurboCreateResponse,
        data=payload,
    )
    if not create.data or not create.data.id:
        raise RuntimeError(f"Kling 3.0 Turbo create failed. Code: {create.code}, Message: {create.message}")

    def first_task_status(result):
        # Status of the first returned task, or None while no task data is present.
        return result.data[0].status if result.data else None

    final_response = await poll_op(
        cls,
        ApiEndpoint(path="/proxy/kling/tasks", query_params={"task_ids": create.data.id}),
        response_model=Kling3TurboQueryResponse,
        status_extractor=first_task_status,
    )
    video_url = _turbo_video_url(final_response)
    return IO.NodeOutput(await download_url_to_video_output(video_url))
|
||||
|
||||
|
||||
class KlingVideoNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -2884,7 +2951,11 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
],
|
||||
tooltip="Generate a series of video segments with individual prompts and durations.",
|
||||
),
|
||||
IO.Boolean.Input("generate_audio", default=True),
|
||||
IO.Boolean.Input(
|
||||
"generate_audio",
|
||||
default=True,
|
||||
tooltip="'kling-3.0-turbo' always generates native audio, so the audio toggle is ignored.",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
@ -2899,6 +2970,17 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
),
|
||||
],
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"kling-3.0-turbo",
|
||||
[
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"], default="720p"),
|
||||
IO.Combo.Input(
|
||||
"aspect_ratio",
|
||||
options=["16:9", "9:16", "1:1"],
|
||||
tooltip="Ignored in image-to-video mode.",
|
||||
),
|
||||
],
|
||||
),
|
||||
],
|
||||
tooltip="Model and generation settings.",
|
||||
),
|
||||
@ -2930,6 +3012,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=[
|
||||
"model",
|
||||
"model.resolution",
|
||||
"generate_audio",
|
||||
"multi_shot",
|
||||
@ -2944,14 +3027,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
$ms := widgets.multi_shot;
|
||||
$isSb := $ms != "disabled";
|
||||
$n := $isSb ? $number($substring($ms, 0, 1)) : 0;
|
||||
@ -2962,7 +3038,18 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
$d5 := $n >= 5 ? $lookup(widgets, "multi_shot.storyboard_5_duration") : 0;
|
||||
$d6 := $n >= 6 ? $lookup(widgets, "multi_shot.storyboard_6_duration") : 0;
|
||||
$dur := $isSb ? $d1 + $d2 + $d3 + $d4 + $d5 + $d6 : $lookup(widgets, "multi_shot.duration");
|
||||
{"type":"usd","usd": $rate * $dur}
|
||||
widgets.model = "kling-3.0-turbo"
|
||||
? {"type":"usd","usd": ($res = "1080p" ? 0.14 : 0.112) * $dur}
|
||||
: (
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
{"type":"usd","usd": $rate * $dur}
|
||||
)
|
||||
)
|
||||
""",
|
||||
),
|
||||
@ -3015,6 +3102,17 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
duration = multi_shot["duration"]
|
||||
validate_string(multi_shot["prompt"], min_length=1, max_length=2500)
|
||||
|
||||
if model["model"] == "kling-3.0-turbo":
|
||||
turbo_prompt = build_turbo_shot_prompt(multi_prompt_list) if custom_multi_shot else multi_shot["prompt"]
|
||||
return await execute_kling_turbo(
|
||||
cls,
|
||||
prompt=turbo_prompt,
|
||||
resolution=model["resolution"],
|
||||
aspect_ratio=model["aspect_ratio"],
|
||||
duration=duration,
|
||||
start_frame=start_frame,
|
||||
)
|
||||
|
||||
if start_frame is not None:
|
||||
validate_image_dimensions(start_frame, min_width=300, min_height=300)
|
||||
validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
|
||||
|
||||
97
comfy_extras/nodes_boogu.py
Normal file
97
comfy_extras/nodes_boogu.py
Normal file
@ -0,0 +1,97 @@
|
||||
import math
|
||||
|
||||
import node_helpers
|
||||
import comfy.utils
|
||||
from typing_extensions import override
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
class TextEncodeBooguEdit(io.ComfyNode):
    """Build positive/negative conditioning for Boogu-Image editing.

    Each edit image feeds two paths, mirroring the reference pipeline:
      - Qwen3-VL vision tokens (instruction understanding) -> positive only
      - VAE reference latent (image identity)              -> positive and negative
    Because the reference latent is present in both conds it cancels under CFG
    (identity preserved), while the vision tokens exist only in the positive so CFG
    amplifies the instruction. The tokenizer picks the matching system prompt by
    itself (image -> TI2I, empty negative -> DROP), so no template plumbing is
    needed here.
    """

    @classmethod
    def define_schema(cls):
        image_slots = io.Autogrow.TemplateNames(
            io.Image.Input("image"),
            names=[f"image_{i}" for i in range(1, 17)],
            min=0,
        )
        node_inputs = [
            io.Clip.Input("clip"),
            io.String.Input("prompt", multiline=True, dynamic_prompts=True),
            io.String.Input("negative_prompt", multiline=True, dynamic_prompts=True, advanced=True),
            io.Vae.Input("vae"),
            io.Autogrow.Input(
                "images",
                template=image_slots,
                tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.",
            ),
        ]
        node_outputs = [
            io.Conditioning.Output(display_name="positive"),
            io.Conditioning.Output(display_name="negative"),
        ]
        return io.Schema(
            node_id="TextEncodeBooguEdit",
            category="model/conditioning/boogu",
            inputs=node_inputs,
            outputs=node_outputs,
        )

    @classmethod
    def execute(cls, clip, prompt, negative_prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput:
        def slot_number(slot_name):
            # Order slots by the number after the last underscore ("image_10" after "image_2").
            return int(slot_name.rpartition("_")[2])

        def as_channels_last_rgb(tensor):
            # Move the channel axis back to the end and keep only the first three channels.
            return tensor.movedim(1, -1)[:, :, :, :3]

        vision_inputs = []
        reference_latents = []
        supplied = images or {}

        for slot in sorted(supplied, key=slot_number):
            picture = supplied[slot]
            if picture is None:
                continue
            channels_first = picture.movedim(-1, 1)
            src_height = channels_first.shape[2]
            src_width = channels_first.shape[3]

            # Vision tower input: the reference caps the VLM image at 384x384
            # (max_vlm_input_pil_pixels in pipeline_boogu.py).
            vl_scale = math.sqrt(int(384 * 384) / (src_width * src_height))
            vl_resized = comfy.utils.common_upscale(
                channels_first,
                round(src_width * vl_scale),
                round(src_height * vl_scale),
                "area",
                "disabled",
            )
            vision_inputs.append(as_channels_last_rgb(vl_resized))

            if vae is None:
                continue

            # Reference latent: align to 16 px (VAE /8 * patch_size 2).
            ref_scale = math.sqrt(int(1024 * 1024) / (src_width * src_height))
            ref_width = round(src_width * ref_scale / 16.0) * 16
            ref_height = round(src_height * ref_scale / 16.0) * 16
            ref_resized = comfy.utils.common_upscale(channels_first, ref_width, ref_height, "area", "disabled")
            reference_latents.append(vae.encode(as_channels_last_rgb(ref_resized)))

        # positive: instruction + vision tokens; negative: empty (no vision). Ref latent on both.
        positive_tokens = clip.tokenize(prompt, images=vision_inputs)
        positive = clip.encode_from_tokens_scheduled(positive_tokens)
        negative_tokens = clip.tokenize(negative_prompt)
        negative = clip.encode_from_tokens_scheduled(negative_tokens)

        if reference_latents:
            positive = node_helpers.conditioning_set_values(
                positive, {"reference_latents": reference_latents}, append=True
            )
            negative = node_helpers.conditioning_set_values(
                negative, {"reference_latents": reference_latents}, append=True
            )

        return io.NodeOutput(positive, negative)
|
||||
|
||||
|
||||
class BooguExtension(ComfyExtension):
    """Extension that contributes the Boogu conditioning node."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes this extension provides."""
        node_classes = [TextEncodeBooguEdit]
        return node_classes
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> BooguExtension:
    """Create and return a new BooguExtension instance."""
    extension = BooguExtension()
    return extension
|
||||
3
nodes.py
3
nodes.py
@ -969,7 +969,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4"], ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4", "boogu"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -2425,6 +2425,7 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_tcfg.py",
|
||||
"nodes_context_windows.py",
|
||||
"nodes_qwen.py",
|
||||
"nodes_boogu.py",
|
||||
"nodes_chroma_radiance.py",
|
||||
"nodes_pid.py",
|
||||
"nodes_model_patch.py",
|
||||
|
||||
Reference in New Issue
Block a user