mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-05-23 01:28:21 +08:00
Compare commits
16 Commits
blueprints
...
v0.22.2
| Author | SHA1 | Date | |
|---|---|---|---|
| 85abace906 | |||
| f5d678d9ee | |||
| 59cafaf744 | |||
| 13e2d133a6 | |||
| ef46f5de76 | |||
| 7e02881b36 | |||
| a8d2519058 | |||
| 4efe1ddb5c | |||
| f9c84c94b4 | |||
| 78b5dec6b6 | |||
| 72e3f6081c | |||
| 7ec7b6ffe9 | |||
| 6887165a9d | |||
| cc4d711eb1 | |||
| 626b082838 | |||
| d0328b442d |
File diff suppressed because it is too large
Load Diff
@ -1,886 +0,0 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 675,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 675,
|
||||
"type": "01b6a731-fb78-4070-9a38-c87146da9604",
|
||||
"pos": [
|
||||
-2480,
|
||||
3400
|
||||
],
|
||||
"size": [
|
||||
360,
|
||||
433.3125
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "input",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "resize_target_longer_size",
|
||||
"name": "resize_type.longer_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.longer_size"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "scale_method"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_body",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_body"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_hands",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_hands"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_face",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_face"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "draw_feet",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_feet"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "stick_width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "stick_width"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "face_point_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "face_point_size"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "score_threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "score_threshold"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"674",
|
||||
"resize_type.longer_size"
|
||||
],
|
||||
[
|
||||
"674",
|
||||
"scale_method"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_body"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_hands"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_face"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"draw_feet"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"stick_width"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"face_point_size"
|
||||
],
|
||||
[
|
||||
"672",
|
||||
"score_threshold"
|
||||
],
|
||||
[
|
||||
"673",
|
||||
"ckpt_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Image to Pose Map (SDPose-OOD)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "01b6a731-fb78-4070-9a38-c87146da9604",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 676,
|
||||
"lastLinkId": 1715,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image to Pose Map (SDPose-OOD)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-3290,
|
||||
3590,
|
||||
190.8984375,
|
||||
288
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1756.2451602089645,
|
||||
3366,
|
||||
128,
|
||||
88
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"linkIds": [
|
||||
1700
|
||||
],
|
||||
"localized_name": "input",
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3614
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "088eefc1-cd8a-4573-993f-9e4da008a12d",
|
||||
"name": "resize_type.longer_size",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
1704
|
||||
],
|
||||
"label": "resize_target_longer_size",
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3634
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e",
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
1705
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3654
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0",
|
||||
"name": "draw_body",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1706
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3674
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c",
|
||||
"name": "draw_hands",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1707
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3694
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "af3a9bce-61f9-4aca-b530-9f65e028b35e",
|
||||
"name": "draw_face",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1708
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3714
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f",
|
||||
"name": "draw_feet",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
1709
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3734
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb",
|
||||
"name": "stick_width",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
1710
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3754
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "aafdd060-ba81-4324-a9cc-b656e1ebc133",
|
||||
"name": "face_point_size",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
1711
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3774
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3",
|
||||
"name": "score_threshold",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
1712
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3794
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ae46de61-2cc6-483e-8ee9-87e4144a2ffa",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
1713
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3814
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "41bec0c6-dffa-4c78-9289-ee678715ae54",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
1714
|
||||
],
|
||||
"pos": [
|
||||
-3123.1015625,
|
||||
3834
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "f05ed8cc-9403-4f14-8085-4364b06f8a48",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
1701
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
-1732.2451602089645,
|
||||
3390
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "29a6584e-4685-4986-8ffd-e6d8539953fd",
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"linkIds": [
|
||||
1715
|
||||
],
|
||||
"pos": [
|
||||
-1732.2451602089645,
|
||||
3410
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 671,
|
||||
"type": "SDPoseKeypointExtractor",
|
||||
"pos": [
|
||||
-2470,
|
||||
3250
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
180
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 1696
|
||||
},
|
||||
{
|
||||
"localized_name": "vae",
|
||||
"name": "vae",
|
||||
"type": "VAE",
|
||||
"link": 1697
|
||||
},
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 1698
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": 1714
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "batch_size"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "keypoints",
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"links": [
|
||||
1699,
|
||||
1715
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SDPoseKeypointExtractor",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
16
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 674,
|
||||
"type": "ResizeImageMaskNode",
|
||||
"pos": [
|
||||
-2960,
|
||||
3490
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "input",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"link": 1700
|
||||
},
|
||||
{
|
||||
"localized_name": "resize_type",
|
||||
"name": "resize_type",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "resize_type"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "resize_type.longer_size",
|
||||
"name": "resize_type.longer_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.longer_size"
|
||||
},
|
||||
"link": 1704
|
||||
},
|
||||
{
|
||||
"localized_name": "scale_method",
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "scale_method"
|
||||
},
|
||||
"link": 1705
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "resized",
|
||||
"name": "resized",
|
||||
"type": "*",
|
||||
"links": [
|
||||
1698
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ResizeImageMaskNode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"scale longer dimension",
|
||||
1024,
|
||||
"area"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 672,
|
||||
"type": "SDPoseDrawKeypoints",
|
||||
"pos": [
|
||||
-2120,
|
||||
3260
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
280
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "keypoints",
|
||||
"name": "keypoints",
|
||||
"type": "POSE_KEYPOINT",
|
||||
"link": 1699
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_body",
|
||||
"name": "draw_body",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_body"
|
||||
},
|
||||
"link": 1706
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_hands",
|
||||
"name": "draw_hands",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_hands"
|
||||
},
|
||||
"link": 1707
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_face",
|
||||
"name": "draw_face",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_face"
|
||||
},
|
||||
"link": 1708
|
||||
},
|
||||
{
|
||||
"localized_name": "draw_feet",
|
||||
"name": "draw_feet",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "draw_feet"
|
||||
},
|
||||
"link": 1709
|
||||
},
|
||||
{
|
||||
"localized_name": "stick_width",
|
||||
"name": "stick_width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "stick_width"
|
||||
},
|
||||
"link": 1710
|
||||
},
|
||||
{
|
||||
"localized_name": "face_point_size",
|
||||
"name": "face_point_size",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "face_point_size"
|
||||
},
|
||||
"link": 1711
|
||||
},
|
||||
{
|
||||
"localized_name": "score_threshold",
|
||||
"name": "score_threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "score_threshold"
|
||||
},
|
||||
"link": 1712
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
1701
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SDPoseDrawKeypoints",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
4,
|
||||
2,
|
||||
0.5
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 673,
|
||||
"type": "CheckpointLoaderSimple",
|
||||
"pos": [
|
||||
-2960,
|
||||
3250
|
||||
],
|
||||
"size": [
|
||||
390,
|
||||
190
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "ckpt_name",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": 1713
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"links": [
|
||||
1696
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "CLIP",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "VAE",
|
||||
"name": "VAE",
|
||||
"type": "VAE",
|
||||
"links": [
|
||||
1697
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CheckpointLoaderSimple",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.15.0",
|
||||
"models": [
|
||||
{
|
||||
"name": "sdpose_wholebody_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors",
|
||||
"directory": "checkpoints"
|
||||
}
|
||||
],
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"sdpose_wholebody_fp16.safetensors"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1696,
|
||||
"origin_id": 673,
|
||||
"origin_slot": 0,
|
||||
"target_id": 671,
|
||||
"target_slot": 0,
|
||||
"type": "MODEL"
|
||||
},
|
||||
{
|
||||
"id": 1697,
|
||||
"origin_id": 673,
|
||||
"origin_slot": 2,
|
||||
"target_id": 671,
|
||||
"target_slot": 1,
|
||||
"type": "VAE"
|
||||
},
|
||||
{
|
||||
"id": 1698,
|
||||
"origin_id": 674,
|
||||
"origin_slot": 0,
|
||||
"target_id": 671,
|
||||
"target_slot": 2,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 1699,
|
||||
"origin_id": 671,
|
||||
"origin_slot": 0,
|
||||
"target_id": 672,
|
||||
"target_slot": 0,
|
||||
"type": "POSE_KEYPOINT"
|
||||
},
|
||||
{
|
||||
"id": 1700,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 674,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE,MASK"
|
||||
},
|
||||
{
|
||||
"id": 1701,
|
||||
"origin_id": 672,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 1704,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 674,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 1705,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 674,
|
||||
"target_slot": 3,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 1706,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 672,
|
||||
"target_slot": 1,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1707,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 672,
|
||||
"target_slot": 2,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1708,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 672,
|
||||
"target_slot": 3,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1709,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 672,
|
||||
"target_slot": 4,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 1710,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 672,
|
||||
"target_slot": 5,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 1711,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 672,
|
||||
"target_slot": 6,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 1712,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 9,
|
||||
"target_id": 672,
|
||||
"target_slot": 7,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 1713,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 10,
|
||||
"target_id": 673,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 1714,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 11,
|
||||
"target_id": 671,
|
||||
"target_slot": 3,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 1715,
|
||||
"origin_id": 671,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "POSE_KEYPOINT"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,484 +0,0 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 10,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 10,
|
||||
"type": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
|
||||
"pos": [
|
||||
-250,
|
||||
8590
|
||||
],
|
||||
"size": [
|
||||
280,
|
||||
360
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "text_per_line",
|
||||
"name": "text_per_line",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text_per_line"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "index",
|
||||
"name": "index",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "index"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "selected_line",
|
||||
"name": "selected_line",
|
||||
"type": "STRING",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"2",
|
||||
"string"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"value"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Select Per-Line Text by Index"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 10,
|
||||
"lastLinkId": 14,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Select Per-Line Text by Index",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-990,
|
||||
8595,
|
||||
128,
|
||||
88
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
710,
|
||||
8585,
|
||||
128,
|
||||
68
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "75417d82-a934-4ac9-b667-d8dcd5a3bfb3",
|
||||
"name": "text_per_line",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
13
|
||||
],
|
||||
"localized_name": "text_per_line",
|
||||
"pos": [
|
||||
-886,
|
||||
8619
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "46e69a73-1804-4ca6-9175-31445bf0be96",
|
||||
"name": "index",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
14
|
||||
],
|
||||
"localized_name": "index",
|
||||
"pos": [
|
||||
-886,
|
||||
8639
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "e34e8ad1-84d2-4bd2-a460-eb7de6067c10",
|
||||
"name": "selected_line",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
10
|
||||
],
|
||||
"localized_name": "selected_line",
|
||||
"pos": [
|
||||
734,
|
||||
8609
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "PreviewAny",
|
||||
"pos": [
|
||||
-500,
|
||||
8400
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
180
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "source",
|
||||
"name": "source",
|
||||
"type": "*",
|
||||
"link": 1
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
6
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "PreviewAny",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
null,
|
||||
null,
|
||||
null
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "RegexExtract",
|
||||
"pos": [
|
||||
-240,
|
||||
8740
|
||||
],
|
||||
"size": [
|
||||
470,
|
||||
460
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"showAdvanced": false,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "string",
|
||||
"name": "string",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "string"
|
||||
},
|
||||
"link": 13
|
||||
},
|
||||
{
|
||||
"localized_name": "regex_pattern",
|
||||
"name": "regex_pattern",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "regex_pattern"
|
||||
},
|
||||
"link": 9
|
||||
},
|
||||
{
|
||||
"localized_name": "mode",
|
||||
"name": "mode",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "mode"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "case_insensitive",
|
||||
"name": "case_insensitive",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "case_insensitive"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "multiline",
|
||||
"name": "multiline",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "multiline"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "dotall",
|
||||
"name": "dotall",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "dotall"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "group_index",
|
||||
"name": "group_index",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "group_index"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
10
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "RegexExtract",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"",
|
||||
"",
|
||||
"First Group",
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-810,
|
||||
8400
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 14
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (line index)",
|
||||
"properties": {
|
||||
"Node name for S&R": "Int (line index)",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
0,
|
||||
"fixed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "StringReplace",
|
||||
"pos": [
|
||||
-240,
|
||||
8400
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
280
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "string",
|
||||
"name": "string",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "string"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "find",
|
||||
"name": "find",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "find"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "replace",
|
||||
"name": "replace",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "replace"
|
||||
},
|
||||
"link": 6
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
9
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "StringReplace",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"(?:[^\\n]*\\n){index}([^\\n]+)",
|
||||
"index",
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 0,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"origin_id": 8,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 1,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 8,
|
||||
"target_slot": 2,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 3,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Text Tools"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": [],
|
||||
"links_added_by_ue": []
|
||||
}
|
||||
}
|
||||
@ -1,713 +0,0 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 251,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 251,
|
||||
"type": "609e1fd1-b731-4b78-89ac-d19b1156b025",
|
||||
"pos": [
|
||||
-1490,
|
||||
130
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
164
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "source_image",
|
||||
"name": "source_image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "columns",
|
||||
"name": "columns",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "columns"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "rows",
|
||||
"name": "rows",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "rows"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "tiles",
|
||||
"name": "tiles",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"228",
|
||||
"value"
|
||||
],
|
||||
[
|
||||
"252",
|
||||
"value"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.20.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Split Image Grid to Tiles"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "609e1fd1-b731-4b78-89ac-d19b1156b025",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 9,
|
||||
"lastNodeId": 252,
|
||||
"lastLinkId": 429,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Split Image Grid to Tiles",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-1690,
|
||||
260,
|
||||
128,
|
||||
108
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-510,
|
||||
590,
|
||||
128,
|
||||
68
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "866ac798-cfbc-450a-b755-e704f86404d9",
|
||||
"name": "source_image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
386,
|
||||
389
|
||||
],
|
||||
"localized_name": "source_image",
|
||||
"pos": [
|
||||
-1586,
|
||||
284
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "bc37b1f8-8ab2-4f19-bd00-75d4fbc4feb3",
|
||||
"name": "columns",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
427
|
||||
],
|
||||
"localized_name": "columns",
|
||||
"pos": [
|
||||
-1586,
|
||||
304
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "d45915da-e848-43dd-9ccc-e3161e9c99d9",
|
||||
"name": "rows",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
428
|
||||
],
|
||||
"localized_name": "rows",
|
||||
"pos": [
|
||||
-1586,
|
||||
324
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "18bc780f-064b-4038-87c6-67dba71deb08",
|
||||
"name": "tiles",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
394
|
||||
],
|
||||
"localized_name": "tiles",
|
||||
"shape": 6,
|
||||
"pos": [
|
||||
-486,
|
||||
614
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 225,
|
||||
"type": "SplitImageToTileList",
|
||||
"pos": [
|
||||
-1010,
|
||||
620
|
||||
],
|
||||
"size": [
|
||||
290,
|
||||
170
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 386
|
||||
},
|
||||
{
|
||||
"localized_name": "tile_width",
|
||||
"name": "tile_width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "tile_width"
|
||||
},
|
||||
"link": 403
|
||||
},
|
||||
{
|
||||
"localized_name": "tile_height",
|
||||
"name": "tile_height",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "tile_height"
|
||||
},
|
||||
"link": 404
|
||||
},
|
||||
{
|
||||
"localized_name": "overlap",
|
||||
"name": "overlap",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "overlap"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"shape": 6,
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
394
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SplitImageToTileList",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.20.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
1024,
|
||||
1024,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 231,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-1080,
|
||||
330
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
190
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 390
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 429
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
404
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "BOOL",
|
||||
"name": "BOOL",
|
||||
"type": "BOOLEAN",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"title": "Math Expression (Height)",
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"max(1, (int(a) + int(b) - 1) // int(b))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 229,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-1090,
|
||||
-30
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
190
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 387
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": 388
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT,BOOLEAN",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
403
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "BOOL",
|
||||
"name": "BOOL",
|
||||
"type": "BOOLEAN",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"title": "Math Expression (Width)",
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"max(1, (int(a) + int(b) - 1) // int(b))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 228,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-1380,
|
||||
90
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 427
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
388
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (grid columns)",
|
||||
"properties": {
|
||||
"Node name for S&R": "Int (grid columns)",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
2,
|
||||
"fixed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 230,
|
||||
"type": "GetImageSize",
|
||||
"pos": [
|
||||
-1380,
|
||||
290
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
100
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 389
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
387
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
390
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSize",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 252,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-1380,
|
||||
470
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 428
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
429
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (grid rows)",
|
||||
"properties": {
|
||||
"Node name for S&R": "Int (grid rows)",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.18.1",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
3,
|
||||
"fixed"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 403,
|
||||
"origin_id": 229,
|
||||
"origin_slot": 1,
|
||||
"target_id": 225,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 404,
|
||||
"origin_id": 231,
|
||||
"origin_slot": 1,
|
||||
"target_id": 225,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 390,
|
||||
"origin_id": 230,
|
||||
"origin_slot": 1,
|
||||
"target_id": 231,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 387,
|
||||
"origin_id": 230,
|
||||
"origin_slot": 0,
|
||||
"target_id": 229,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 388,
|
||||
"origin_id": 228,
|
||||
"origin_slot": 0,
|
||||
"target_id": 229,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 386,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 225,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 389,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 230,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 394,
|
||||
"origin_id": 225,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 427,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 228,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 428,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 252,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 429,
|
||||
"origin_id": 252,
|
||||
"origin_slot": 0,
|
||||
"target_id": 231,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Crop"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -152,6 +152,11 @@ class StableAudio1(LatentFormat):
|
||||
latent_dimensions = 1
|
||||
temporal_downscale_ratio = 2048
|
||||
|
||||
class StableAudio3(LatentFormat):
    """Latent-format constants for StableAudio3: 256-channel, one-dimensional latents.

    NOTE(review): `temporal_downscale_ratio` presumably is the number of input
    samples represented by one latent step -- confirm against the code that
    reads it.
    """
    temporal_downscale_ratio = 4096
    latent_dimensions = 1
    latent_channels = 256
|
||||
|
||||
class Flux(SD3):
|
||||
latent_channels = 16
|
||||
def __init__(self):
|
||||
|
||||
@ -10,6 +10,17 @@ from torch import nn
|
||||
from torch.nn import functional as F
|
||||
import math
|
||||
import comfy.ops
|
||||
from .embedders import ExpoFourierFeatures
|
||||
|
||||
|
||||
def _left_pad_to_match(emb, target_len):
|
||||
emb_len = emb.shape[-2]
|
||||
if emb_len < target_len:
|
||||
return F.pad(emb, (0, 0, target_len - emb_len, 0), value=0.)
|
||||
elif emb_len > target_len:
|
||||
return emb[:, -target_len:, :]
|
||||
return emb
|
||||
|
||||
|
||||
class FourierFeatures(nn.Module):
|
||||
def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
|
||||
@ -22,6 +33,7 @@ class FourierFeatures(nn.Module):
|
||||
f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
|
||||
return torch.cat([f.cos(), f.sin()], dim=-1)
|
||||
|
||||
|
||||
# norms
|
||||
class LayerNorm(nn.Module):
|
||||
def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None):
|
||||
@ -43,6 +55,16 @@ class LayerNorm(nn.Module):
|
||||
beta = comfy.ops.cast_to_input(beta, x)
|
||||
return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)
|
||||
|
||||
|
||||
class RMSNorm(nn.Module):
    """RMS normalisation over the last dimension with a learned per-feature scale.

    The scale `gamma` has shape `(dim,)` and is allocated uninitialised
    (`torch.empty`); presumably it is filled when weights are loaded --
    TODO confirm.
    """

    def __init__(self, dim, dtype=None, device=None):
        super().__init__()
        scale = torch.empty(dim, dtype=dtype, device=device)
        self.gamma = nn.Parameter(scale)

    def forward(self, x):
        # Normalise over the trailing dimension only.
        normalized_shape = x.shape[-1:]
        # gamma goes through comfy.ops.cast_to_input together with x before use
        # (presumably to match x's dtype/device -- confirm in comfy.ops).
        gamma = comfy.ops.cast_to_input(self.gamma, x)
        return F.rms_norm(x, normalized_shape, weight=gamma)
|
||||
|
||||
|
||||
class GLU(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
@ -236,13 +258,6 @@ class FeedForward(nn.Module):
|
||||
|
||||
linear_out = operations.Linear(inner_dim, dim_out, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(inner_dim, dim_out, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device)
|
||||
|
||||
# # init last linear layer to 0
|
||||
# if zero_init_output:
|
||||
# nn.init.zeros_(linear_out.weight)
|
||||
# if not no_bias:
|
||||
# nn.init.zeros_(linear_out.bias)
|
||||
|
||||
|
||||
self.ff = nn.Sequential(
|
||||
linear_in,
|
||||
rearrange('b d n -> b n d') if use_conv else nn.Identity(),
|
||||
@ -261,8 +276,10 @@ class Attention(nn.Module):
|
||||
dim_context = None,
|
||||
causal = False,
|
||||
zero_init_output=True,
|
||||
qk_norm = False,
|
||||
qk_norm = "none",
|
||||
differential = False,
|
||||
natten_kernel_size = None,
|
||||
feat_scale = False,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
@ -271,6 +288,7 @@ class Attention(nn.Module):
|
||||
self.dim = dim
|
||||
self.dim_heads = dim_heads
|
||||
self.causal = causal
|
||||
self.differential = differential
|
||||
|
||||
dim_kv = dim_context if dim_context is not None else dim
|
||||
|
||||
@ -278,18 +296,37 @@ class Attention(nn.Module):
|
||||
self.kv_heads = dim_kv // dim_heads
|
||||
|
||||
if dim_context is not None:
|
||||
self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
|
||||
self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
|
||||
if differential:
|
||||
self.to_q = operations.Linear(dim, dim * 2, bias=False, dtype=dtype, device=device)
|
||||
self.to_kv = operations.Linear(dim_kv, dim_kv * 3, bias=False, dtype=dtype, device=device)
|
||||
else:
|
||||
self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
|
||||
self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
|
||||
else:
|
||||
self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
|
||||
if differential:
|
||||
self.to_qkv = operations.Linear(dim, dim * 5, bias=False, dtype=dtype, device=device)
|
||||
else:
|
||||
self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
|
||||
|
||||
self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
|
||||
|
||||
# if zero_init_output:
|
||||
# nn.init.zeros_(self.to_out.weight)
|
||||
|
||||
# Accept bool for backward compat
|
||||
if isinstance(qk_norm, bool):
|
||||
qk_norm = "l2" if qk_norm else "none"
|
||||
self.qk_norm = qk_norm
|
||||
|
||||
if self.qk_norm == "ln":
|
||||
self.q_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
|
||||
self.k_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
|
||||
elif self.qk_norm == "rms":
|
||||
self.q_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
|
||||
self.k_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
|
||||
|
||||
self.feat_scale = feat_scale
|
||||
|
||||
if self.feat_scale:
|
||||
self.lambda_dc = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
|
||||
self.lambda_hf = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -306,22 +343,51 @@ class Attention(nn.Module):
|
||||
kv_input = context if has_context else x
|
||||
|
||||
if hasattr(self, 'to_q'):
|
||||
# Use separate linear projections for q and k/v
|
||||
q = self.to_q(x)
|
||||
q = rearrange(q, 'b n (h d) -> b h n d', h = h)
|
||||
if self.differential:
|
||||
# cross-attention differential: to_q → (q, q_diff), to_kv → (k, k_diff, v)
|
||||
q, q_diff = self.to_q(x).chunk(2, dim=-1)
|
||||
q = rearrange(q, 'b n (h d) -> b h n d', h=h)
|
||||
q_diff = rearrange(q_diff, 'b n (h d) -> b h n d', h=h)
|
||||
q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D)
|
||||
k, k_diff, v = self.to_kv(kv_input).chunk(3, dim=-1)
|
||||
k = rearrange(k, 'b n (h d) -> b h n d', h=kv_h)
|
||||
k_diff = rearrange(k_diff, 'b n (h d) -> b h n d', h=kv_h)
|
||||
v = rearrange(v, 'b n (h d) -> b h n d', h=kv_h)
|
||||
k = torch.stack([k, k_diff], dim=1) # (B, 2, H, M, D)
|
||||
else:
|
||||
# Use separate linear projections for q and k/v
|
||||
q = self.to_q(x)
|
||||
q = rearrange(q, 'b n (h d) -> b h n d', h = h)
|
||||
|
||||
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
|
||||
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
|
||||
|
||||
k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
|
||||
k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
|
||||
else:
|
||||
# Use fused linear projection
|
||||
q, k, v = self.to_qkv(x).chunk(3, dim=-1)
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
|
||||
if self.differential:
|
||||
# self-attention differential: to_qkv → (q, k, v, q_diff, k_diff)
|
||||
q, k, v, q_diff, k_diff = self.to_qkv(x).chunk(5, dim=-1)
|
||||
q, k, v, q_diff, k_diff = map(
|
||||
lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h),
|
||||
(q, k, v, q_diff, k_diff)
|
||||
)
|
||||
q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D)
|
||||
k = torch.stack([k, k_diff], dim=1)
|
||||
else:
|
||||
# Use fused linear projection
|
||||
q, k, v = self.to_qkv(x).chunk(3, dim=-1)
|
||||
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
|
||||
|
||||
# Normalize q and k for cosine sim attention
|
||||
if self.qk_norm:
|
||||
if self.qk_norm == "l2":
|
||||
q = F.normalize(q, dim=-1)
|
||||
k = F.normalize(k, dim=-1)
|
||||
elif self.qk_norm == "rms":
|
||||
q_type, k_type = q.dtype, k.dtype
|
||||
q = self.q_norm(q).to(q_type)
|
||||
k = self.k_norm(k).to(k_type)
|
||||
elif self.qk_norm != 'none':
|
||||
q = self.q_norm(q)
|
||||
k = self.k_norm(k)
|
||||
|
||||
if rotary_pos_emb is not None and not has_context:
|
||||
freqs, _ = rotary_pos_emb
|
||||
@ -364,9 +430,24 @@ class Attention(nn.Module):
|
||||
heads_per_kv_head = h // kv_h
|
||||
k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v))
|
||||
|
||||
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
|
||||
if self.differential:
|
||||
q, q_diff = q.unbind(dim=1)
|
||||
k, k_diff = k.unbind(dim=1)
|
||||
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
|
||||
out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, transformer_options=transformer_options)
|
||||
out = out - out_diff
|
||||
else:
|
||||
out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
|
||||
|
||||
out = self.to_out(out)
|
||||
|
||||
if self.feat_scale:
|
||||
out_dc = out.mean(dim=-2, keepdim=True)
|
||||
out_hf = out - out_dc
|
||||
|
||||
# Selectively modulate DC and high frequency components
|
||||
out = out + comfy.ops.cast_to_input(self.lambda_dc, out) * out_dc + comfy.ops.cast_to_input(self.lambda_hf, out) * out_hf
|
||||
|
||||
if mask is not None:
|
||||
mask = rearrange(mask, 'b n -> b n 1')
|
||||
out = out.masked_fill(~mask, 0.)
|
||||
@ -417,11 +498,14 @@ class TransformerBlock(nn.Module):
|
||||
cross_attend = False,
|
||||
dim_context = None,
|
||||
global_cond_dim = None,
|
||||
global_cond_shared_embed = False,
|
||||
local_add_cond_dim = None,
|
||||
causal = False,
|
||||
zero_init_branch_outputs = True,
|
||||
conformer = False,
|
||||
layer_ix = -1,
|
||||
remove_norms = False,
|
||||
norm_type = "layer_norm",
|
||||
attn_kwargs = {},
|
||||
ff_kwargs = {},
|
||||
norm_kwargs = {},
|
||||
@ -436,8 +520,20 @@ class TransformerBlock(nn.Module):
|
||||
self.cross_attend = cross_attend
|
||||
self.dim_context = dim_context
|
||||
self.causal = causal
|
||||
self.global_cond_shared_embed = global_cond_shared_embed
|
||||
|
||||
self.pre_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
|
||||
norm_layer_map = {
|
||||
"layer_norm": LayerNorm,
|
||||
"rms_norm": RMSNorm,
|
||||
}
|
||||
norm_cls = norm_layer_map.get(norm_type, LayerNorm)
|
||||
|
||||
def make_norm():
|
||||
if remove_norms:
|
||||
return nn.Identity()
|
||||
return norm_cls(dim, dtype=dtype, device=device, **norm_kwargs)
|
||||
|
||||
self.pre_norm = make_norm()
|
||||
|
||||
self.self_attn = Attention(
|
||||
dim,
|
||||
@ -451,7 +547,7 @@ class TransformerBlock(nn.Module):
|
||||
)
|
||||
|
||||
if cross_attend:
|
||||
self.cross_attend_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
|
||||
self.cross_attend_norm = make_norm()
|
||||
self.cross_attn = Attention(
|
||||
dim,
|
||||
dim_heads = dim_heads,
|
||||
@ -464,37 +560,56 @@ class TransformerBlock(nn.Module):
|
||||
**attn_kwargs
|
||||
)
|
||||
|
||||
self.ff_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
|
||||
self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations,**ff_kwargs)
|
||||
self.ff_norm = make_norm()
|
||||
self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations, **ff_kwargs)
|
||||
|
||||
self.layer_ix = layer_ix
|
||||
|
||||
self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None
|
||||
|
||||
self.global_cond_dim = global_cond_dim
|
||||
# Global conditioning
|
||||
self.has_global_cond = (global_cond_dim is not None) or global_cond_shared_embed
|
||||
|
||||
if global_cond_dim is not None:
|
||||
if global_cond_shared_embed:
|
||||
# SA3 style: learnable per-block additive bias; global_cond is pre-projected to (B, dim*6)
|
||||
self.to_scale_shift_gate = nn.Parameter(torch.empty(dim * 6, device=device, dtype=dtype))
|
||||
elif global_cond_dim is not None:
|
||||
# SA1 style: per-block MLP projects global_cond → (B, dim*6)
|
||||
self.to_scale_shift_gate = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
nn.Linear(global_cond_dim, dim * 6, bias=False)
|
||||
operations.Linear(global_cond_dim, dim * 6, bias=False, device=device, dtype=dtype)
|
||||
)
|
||||
|
||||
nn.init.zeros_(self.to_scale_shift_gate[1].weight)
|
||||
#nn.init.zeros_(self.to_scale_shift_gate_self[1].bias)
|
||||
# Local additive conditioning (e.g. inpaint mask + masked latent)
|
||||
self.local_add_cond_dim = local_add_cond_dim
|
||||
if local_add_cond_dim is not None:
|
||||
self.to_local_embed = nn.Sequential(
|
||||
operations.Linear(local_add_cond_dim, dim, bias=True, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Linear(dim, dim, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
else:
|
||||
self.to_local_embed = None
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x,
|
||||
context = None,
|
||||
global_cond=None,
|
||||
local_add_cond=None,
|
||||
mask = None,
|
||||
context_mask = None,
|
||||
rotary_pos_emb = None,
|
||||
transformer_options={}
|
||||
):
|
||||
if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:
|
||||
if self.has_global_cond and global_cond is not None:
|
||||
if self.global_cond_shared_embed:
|
||||
# global_cond already has shape (B, dim*6)
|
||||
ssg = (comfy.ops.cast_to_input(self.to_scale_shift_gate, global_cond) + global_cond).unsqueeze(1)
|
||||
else:
|
||||
ssg = self.to_scale_shift_gate(global_cond).unsqueeze(1)
|
||||
|
||||
scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(global_cond).unsqueeze(1).chunk(6, dim = -1)
|
||||
scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = ssg.chunk(6, dim = -1)
|
||||
|
||||
# self-attention with adaLN
|
||||
residual = x
|
||||
@ -510,6 +625,9 @@ class TransformerBlock(nn.Module):
|
||||
if self.conformer is not None:
|
||||
x = x + self.conformer(x)
|
||||
|
||||
if local_add_cond is not None and self.to_local_embed is not None:
|
||||
x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
|
||||
|
||||
# feedforward with adaLN
|
||||
residual = x
|
||||
x = self.ff_norm(x)
|
||||
@ -527,6 +645,9 @@ class TransformerBlock(nn.Module):
|
||||
if self.conformer is not None:
|
||||
x = x + self.conformer(x)
|
||||
|
||||
if local_add_cond is not None and self.to_local_embed is not None:
|
||||
x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
|
||||
|
||||
x = x + self.ff(self.ff_norm(x))
|
||||
|
||||
return x
|
||||
@ -543,6 +664,8 @@ class ContinuousTransformer(nn.Module):
|
||||
cross_attend=False,
|
||||
cond_token_dim=None,
|
||||
global_cond_dim=None,
|
||||
global_cond_shared_embed=False,
|
||||
local_add_cond_dim=None,
|
||||
causal=False,
|
||||
rotary_pos_emb=True,
|
||||
zero_init_branch_outputs=True,
|
||||
@ -550,6 +673,7 @@ class ContinuousTransformer(nn.Module):
|
||||
use_sinusoidal_emb=False,
|
||||
use_abs_pos_emb=False,
|
||||
abs_pos_emb_max_length=10000,
|
||||
num_memory_tokens=0,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
@ -562,6 +686,8 @@ class ContinuousTransformer(nn.Module):
|
||||
self.depth = depth
|
||||
self.causal = causal
|
||||
self.layers = nn.ModuleList([])
|
||||
self.num_memory_tokens = num_memory_tokens
|
||||
self.global_cond_shared_embed = global_cond_shared_embed
|
||||
|
||||
self.project_in = operations.Linear(dim_in, dim, bias=False, dtype=dtype, device=device) if dim_in is not None else nn.Identity()
|
||||
self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
|
||||
@ -577,7 +703,22 @@ class ContinuousTransformer(nn.Module):
|
||||
|
||||
self.use_abs_pos_emb = use_abs_pos_emb
|
||||
if use_abs_pos_emb:
|
||||
self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length)
|
||||
self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length + num_memory_tokens)
|
||||
|
||||
if num_memory_tokens > 0:
|
||||
self.memory_tokens = nn.Parameter(torch.empty(num_memory_tokens, dim, device=device, dtype=dtype))
|
||||
|
||||
# Shared global-cond embedder (SA3 style): projects (B, global_cond_dim) → (B, dim*6)
|
||||
self.global_cond_embedder = None
|
||||
if global_cond_shared_embed and global_cond_dim is not None:
|
||||
self.global_cond_embedder = nn.Sequential(
|
||||
operations.Linear(global_cond_dim, dim, bias=True, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Linear(dim, dim * 6, bias=True, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
# When using shared embed, TransformerBlocks use per-block Parameter (not per-block MLP)
|
||||
block_global_cond_dim = None if global_cond_shared_embed else global_cond_dim
|
||||
|
||||
for i in range(depth):
|
||||
self.layers.append(
|
||||
@ -586,7 +727,9 @@ class ContinuousTransformer(nn.Module):
|
||||
dim_heads = dim_heads,
|
||||
cross_attend = cross_attend,
|
||||
dim_context = cond_token_dim,
|
||||
global_cond_dim = global_cond_dim,
|
||||
global_cond_dim = block_global_cond_dim,
|
||||
global_cond_shared_embed = global_cond_shared_embed,
|
||||
local_add_cond_dim = local_add_cond_dim,
|
||||
causal = causal,
|
||||
zero_init_branch_outputs = zero_init_branch_outputs,
|
||||
conformer=conformer,
|
||||
@ -605,6 +748,7 @@ class ContinuousTransformer(nn.Module):
|
||||
prepend_embeds = None,
|
||||
prepend_mask = None,
|
||||
global_cond = None,
|
||||
local_add_cond = None,
|
||||
return_info = False,
|
||||
**kwargs
|
||||
):
|
||||
@ -632,7 +776,9 @@ class ContinuousTransformer(nn.Module):
|
||||
|
||||
mask = torch.cat((prepend_mask, mask), dim = -1)
|
||||
|
||||
# Attention layers
|
||||
if self.num_memory_tokens > 0:
|
||||
memory_tokens = comfy.ops.cast_to_input(self.memory_tokens, x).expand(batch, -1, -1)
|
||||
x = torch.cat((memory_tokens, x), dim=1)
|
||||
|
||||
if self.rotary_pos_emb is not None:
|
||||
rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=torch.float, device=x.device)
|
||||
@ -642,6 +788,10 @@ class ContinuousTransformer(nn.Module):
|
||||
if self.use_sinusoidal_emb or self.use_abs_pos_emb:
|
||||
x = x + self.pos_emb(x)
|
||||
|
||||
# Project global_cond once (SA3 shared-embed path)
|
||||
if global_cond is not None and self.global_cond_embedder is not None:
|
||||
global_cond = self.global_cond_embedder(global_cond)
|
||||
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
# Iterate over the transformer layers
|
||||
for i, layer in enumerate(self.layers):
|
||||
@ -654,12 +804,17 @@ class ContinuousTransformer(nn.Module):
|
||||
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
|
||||
x = out["img"]
|
||||
else:
|
||||
x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options)
|
||||
# x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
|
||||
x = layer(x, rotary_pos_emb=rotary_pos_emb, global_cond=global_cond,
|
||||
local_add_cond=local_add_cond, context=context,
|
||||
transformer_options=transformer_options)
|
||||
|
||||
if return_info:
|
||||
info["hidden_states"].append(x)
|
||||
|
||||
# Strip memory tokens before projecting out
|
||||
if self.num_memory_tokens > 0:
|
||||
x = x[:, self.num_memory_tokens:, :]
|
||||
|
||||
x = self.project_out(x)
|
||||
|
||||
if return_info:
|
||||
@ -682,6 +837,7 @@ class AudioDiffusionTransformer(nn.Module):
|
||||
num_heads=24,
|
||||
transformer_type: tp.Literal["continuous_transformer"] = "continuous_transformer",
|
||||
global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend",
|
||||
timestep_features_type: str = "learned",
|
||||
audio_model="",
|
||||
dtype=None,
|
||||
device=None,
|
||||
@ -696,7 +852,10 @@ class AudioDiffusionTransformer(nn.Module):
|
||||
# Timestep embeddings
|
||||
timestep_features_dim = 256
|
||||
|
||||
self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
|
||||
if timestep_features_type == "expo":
|
||||
self.timestep_features = ExpoFourierFeatures(timestep_features_dim, 0.5, 10000.0)
|
||||
else:
|
||||
self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
|
||||
|
||||
self.to_timestep_embed = nn.Sequential(
|
||||
operations.Linear(timestep_features_dim, embed_dim, bias=True, dtype=dtype, device=device),
|
||||
@ -781,6 +940,7 @@ class AudioDiffusionTransformer(nn.Module):
|
||||
cross_attn_cond=None,
|
||||
cross_attn_cond_mask=None,
|
||||
input_concat_cond=None,
|
||||
local_add_cond=None,
|
||||
global_embed=None,
|
||||
prepend_cond=None,
|
||||
prepend_cond_mask=None,
|
||||
@ -802,9 +962,13 @@ class AudioDiffusionTransformer(nn.Module):
|
||||
prepend_cond = self.to_prepend_embed(prepend_cond)
|
||||
|
||||
prepend_inputs = prepend_cond
|
||||
prepend_length = prepend_cond.shape[1]
|
||||
if prepend_cond_mask is not None:
|
||||
prepend_mask = prepend_cond_mask
|
||||
|
||||
if local_add_cond is not None and local_add_cond.dim() == 3:
|
||||
local_add_cond = local_add_cond.permute(0, 2, 1)
|
||||
|
||||
if input_concat_cond is not None:
|
||||
|
||||
# Interpolate input_concat_cond to the same length as x
|
||||
@ -850,7 +1014,7 @@ class AudioDiffusionTransformer(nn.Module):
|
||||
if self.transformer_type == "x-transformers":
|
||||
output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, **extra_args, **kwargs)
|
||||
elif self.transformer_type == "continuous_transformer":
|
||||
output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, **extra_args, **kwargs)
|
||||
output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, local_add_cond=local_add_cond, **extra_args, **kwargs)
|
||||
|
||||
if return_info:
|
||||
output, info = output
|
||||
@ -876,6 +1040,7 @@ class AudioDiffusionTransformer(nn.Module):
|
||||
context=None,
|
||||
context_mask=None,
|
||||
input_concat_cond=None,
|
||||
local_add_cond=None,
|
||||
global_embed=None,
|
||||
negative_global_embed=None,
|
||||
prepend_cond=None,
|
||||
@ -890,6 +1055,7 @@ class AudioDiffusionTransformer(nn.Module):
|
||||
cross_attn_cond=context,
|
||||
cross_attn_cond_mask=context_mask,
|
||||
input_concat_cond=input_concat_cond,
|
||||
local_add_cond=local_add_cond,
|
||||
global_embed=global_embed,
|
||||
prepend_cond=prepend_cond,
|
||||
prepend_cond_mask=prepend_cond_mask,
|
||||
|
||||
@ -31,15 +31,39 @@ def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
|
||||
)
|
||||
|
||||
|
||||
class ExpoFourierFeatures(nn.Module):
    """Exponentially-spaced Fourier features (no learnable parameters).

    Frequencies are spaced geometrically between ``min_freq`` and ``max_freq``
    (``dim // 2`` of them); the output concatenates the cosines and then the
    sines of ``2 * pi * t * freq``.

    Args:
        dim: Size of the produced feature vector. Must be a positive even
            integer, because half of the features are cosines and half sines.
        min_freq: Lowest frequency (must be > 0, its logarithm is taken).
        max_freq: Highest frequency (must be > 0, its logarithm is taken).

    Raises:
        ValueError: If ``dim`` is not a positive even integer or a frequency
            bound is not strictly positive.
    """

    def __init__(self, dim, min_freq=0.5, max_freq=10000.0):
        super().__init__()
        # An odd dim would silently yield dim - 1 features and only fail later
        # in whatever layer consumes them, so reject it here.
        if dim <= 0 or dim % 2 != 0:
            raise ValueError(f"dim must be a positive even integer, got {dim!r}")
        if min_freq <= 0 or max_freq <= 0:
            raise ValueError(
                f"min_freq and max_freq must be > 0, got {min_freq!r} and {max_freq!r}")
        self.dim = dim
        self.min_freq = min_freq
        self.max_freq = max_freq

    def forward(self, t):
        """Embed ``t`` and return features in the dtype ``t`` arrived in.

        A 1-D ``t`` is treated as a batch of scalars and gains a trailing axis
        before being broadcast against the frequency vector.
        """
        in_dtype = t.dtype
        # Compute in float32 regardless of the input precision.
        t = t.float()
        if t.dim() == 1:
            t = t.unsqueeze(-1)
        half_dim = self.dim // 2
        log_min = math.log(self.min_freq)
        log_max = math.log(self.max_freq)
        ramp = torch.linspace(0, 1, half_dim, device=t.device, dtype=torch.float32)
        # Linear interpolation in log-space == geometric spacing of frequencies.
        freqs = torch.exp(ramp * (log_max - log_min) + log_min)
        args = t * freqs * 2 * math.pi
        return torch.cat([args.cos(), args.sin()], dim=-1).to(in_dtype)
|
||||
|
||||
|
||||
class NumberEmbedder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
features: int,
|
||||
dim: int = 256,
|
||||
fourier_features_type="learned",
|
||||
):
|
||||
super().__init__()
|
||||
self.features = features
|
||||
self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
|
||||
if fourier_features_type == "expo":
|
||||
self.embedding = nn.Sequential(ExpoFourierFeatures(dim=dim), comfy.ops.manual_cast.Linear(in_features=dim, out_features=features))
|
||||
else:
|
||||
self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
|
||||
|
||||
def forward(self, x: Union[List[float], Tensor]) -> Tensor:
|
||||
if not torch.is_tensor(x):
|
||||
@ -77,14 +101,15 @@ class NumberConditioner(Conditioner):
|
||||
def __init__(self,
|
||||
output_dim: int,
|
||||
min_val: float=0,
|
||||
max_val: float=1
|
||||
max_val: float=1,
|
||||
fourier_features_type: str = "learned",
|
||||
):
|
||||
super().__init__(output_dim, output_dim)
|
||||
|
||||
self.min_val = min_val
|
||||
self.max_val = max_val
|
||||
|
||||
self.embedder = NumberEmbedder(features=output_dim)
|
||||
self.embedder = NumberEmbedder(features=output_dim, fourier_features_type=fourier_features_type)
|
||||
|
||||
def forward(self, floats, device=None):
|
||||
# Cast the inputs to floats
|
||||
|
||||
533
comfy/ldm/audio/vae_sa3.py
Normal file
533
comfy/ldm/audio/vae_sa3.py
Normal file
@ -0,0 +1,533 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import comfy.ops
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.audio.autoencoder import WNConv1d
|
||||
|
||||
ops = comfy.ops.disable_weight_init
|
||||
|
||||
class Transpose(nn.Module):
    """Module wrapper that swaps the last two axes of its input.

    Extra keyword arguments are accepted and ignored so the module can sit in
    a layer list whose members are all called the same way.
    """

    def forward(self, x, **kwargs):
        swapped = torch.transpose(x, -2, -1)
        return swapped
|
||||
|
||||
|
||||
def _zero_pad_modulo_sequence(x, size, dim=-2):
|
||||
input_len = x.shape[dim]
|
||||
pad_len = (size - input_len % size) % size
|
||||
if pad_len > 0:
|
||||
pad_shape = list(x.shape)
|
||||
pad_shape[dim] = pad_len
|
||||
x = torch.cat([x, torch.zeros(pad_shape, device=x.device, dtype=x.dtype)], dim=dim)
|
||||
return x
|
||||
|
||||
|
||||
def _sliding_window_mask(seq_len, window, device, dtype):
|
||||
"""Additive attention mask enforcing a ±window local window (matches flash_attn window_size)."""
|
||||
i = torch.arange(seq_len, device=device).unsqueeze(1)
|
||||
j = torch.arange(seq_len, device=device).unsqueeze(0)
|
||||
out_of_window = (j - i).abs() > window
|
||||
return torch.where(
|
||||
out_of_window,
|
||||
torch.full((1,), torch.finfo(dtype).min / 4, device=device, dtype=dtype),
|
||||
torch.zeros(1, device=device, dtype=dtype),
|
||||
)
|
||||
|
||||
|
||||
class DynamicTanh(nn.Module):
    """Element-wise ``gamma * tanh(alpha * x) + beta`` with learnable parameters.

    ``alpha`` is a single scalar, ``gamma`` and ``beta`` have ``dim`` entries.
    All three are allocated uninitialised (``torch.empty``), so they are
    presumably meant to be filled from a checkpoint; ``init_alpha`` and any
    extra keyword arguments are accepted but not used.
    """

    def __init__(self, dim, init_alpha=4.0, dtype=None, device=None, **kwargs):
        super().__init__()
        factory = {"dtype": dtype, "device": device}
        self.alpha = nn.Parameter(torch.empty(1, **factory))
        self.gamma = nn.Parameter(torch.empty(dim, **factory))
        self.beta = nn.Parameter(torch.empty(dim, **factory))

    def forward(self, x):
        # Bring every parameter to the input's dtype/device before use.
        alpha, gamma, beta = (
            comfy.ops.cast_to_input(param, x)
            for param in (self.alpha, self.gamma, self.beta)
        )
        squashed = torch.tanh(alpha * x)
        return gamma * squashed + beta
|
||||
|
||||
|
||||
class RotaryEmbedding(nn.Module):
    """Produces rotary position angles from a stored ``inv_freq`` buffer.

    Args:
        dim: Rotary dimension; the ``inv_freq`` buffer holds ``dim // 2`` values.
        base, base_rescale_factor: Accepted for signature compatibility only.
            The buffer is allocated uninitialised and is never derived from
            them here, so it is presumably loaded from a checkpoint (verify
            against the loader).
        dtype, device: Forwarded to the buffer allocation.
    """

    def __init__(self, dim, base=10000, base_rescale_factor=1., dtype=None, device=None):
        super().__init__()
        # The rescaled base used to be computed here but was never used, and
        # its ``dim / (dim - 2)`` exponent raised ZeroDivisionError for
        # dim == 2. It is intentionally not computed any more.
        self.register_buffer("inv_freq", torch.empty(dim // 2, dtype=dtype, device=device))

    def forward_from_seq_len(self, seq_len, device, dtype=None):
        """Return ``forward`` evaluated on positions ``0 .. seq_len - 1``.

        ``dtype`` is accepted but ignored; positions are always float32.
        """
        t = torch.arange(seq_len, device=device, dtype=torch.float32)
        return self.forward(t)

    def forward(self, t):
        """Return ``(freqs, 1.)`` where ``freqs`` has shape ``(len(t), 2 * len(inv_freq))``.

        ``freqs`` is the outer product of positions and ``inv_freq`` repeated
        twice along the last axis; the second tuple element is a constant scale.
        """
        inv_freq = comfy.model_management.cast_to(self.inv_freq, dtype=torch.float32, device=t.device)
        freqs = torch.outer(t.float(), inv_freq)
        freqs = torch.cat((freqs, freqs), dim=-1)
        return freqs, 1.
|
||||
|
||||
|
||||
def _rotate_half(x):
|
||||
d = x.shape[-1] // 2
|
||||
return torch.cat((-x[..., d:], x[..., :d]), dim=-1)
|
||||
|
||||
|
||||
def _apply_rotary_pos_emb(t, freqs):
|
||||
out_dtype = t.dtype
|
||||
rot_dim = freqs.shape[-1]
|
||||
seq_len = t.shape[-2]
|
||||
freqs = freqs[-seq_len:]
|
||||
t_rot, t_pass = t[..., :rot_dim], t[..., rot_dim:]
|
||||
t_rot = t_rot * freqs.cos() + _rotate_half(t_rot) * freqs.sin()
|
||||
return torch.cat((t_rot.to(out_dtype), t_pass.to(out_dtype)), dim=-1)
|
||||
|
||||
|
||||
class Attention(nn.Module):
    """Multi-head self-attention with optional QK norm, rotary embedding and
    differential attention.

    Args:
        dim: Model width; also the output width.
        dim_heads: Width of one head; ``num_heads = dim // dim_heads``.
        qk_norm: ``"none"``, ``"dyt"`` (DynamicTanh) or ``"rms"`` (RMSNorm),
            applied per head to queries and keys.
        qk_norm_eps: Epsilon for the RMSNorm variant.
        differential: When true, a second query/key pair is projected and the
            result is ``attn(q, k, v) - attn(q_diff, k_diff, v)``.
        zero_init_output: Accepted for signature compatibility; unused here.
        dtype, device: Forwarded to the created layers.
        operations: Namespace providing ``Linear`` and ``RMSNorm`` classes.

    Raises:
        ValueError: If ``qk_norm`` is not one of the supported values.
    """

    _QK_NORMS = ("none", "dyt", "rms")

    def __init__(self, dim, dim_heads=64, qk_norm="none", qk_norm_eps=1e-6,
                 differential=False, zero_init_output=True,
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        # Fail at construction time: an unknown value used to be accepted here
        # and only surfaced in forward() as an AttributeError on q_norm.
        if qk_norm not in self._QK_NORMS:
            raise ValueError(f"qk_norm must be one of {self._QK_NORMS}, got {qk_norm!r}")

        self.num_heads = dim // dim_heads
        self.differential = differential
        self.qk_norm = qk_norm

        # q, k, v (+ q_diff, k_diff for differential attention) in one projection.
        self.to_qkv = operations.Linear(
            dim, dim * (5 if differential else 3), bias=False, dtype=dtype, device=device)
        self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)

        if qk_norm == "dyt":
            self.q_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
            self.k_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
        elif qk_norm == "rms":
            self.q_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)
            self.k_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)

    def forward(self, x, rotary_pos_emb=None, mask=None, **kwargs):
        """Attend over ``x`` of shape ``(B, N, dim)``.

        Args:
            x: Input sequence.
            rotary_pos_emb: Optional ``(freqs, scale)`` tuple; only ``freqs``
                is used.
            mask: Optional attention mask forwarded to ``optimized_attention``.
        """
        B, N, _ = x.shape
        h = self.num_heads

        # Split the fused projection and move heads ahead of the sequence
        # axis: (B, N, dim) -> (B, h, N, dim_heads).
        n_parts = 5 if self.differential else 3
        heads = [part.view(B, N, h, -1).transpose(1, 2)
                 for part in self.to_qkv(x).chunk(n_parts, dim=-1)]
        if self.differential:
            q, k, v, q_diff, k_diff = heads
        else:
            q, k, v = heads
            q_diff = k_diff = None
        del heads

        if self.qk_norm != "none":
            q_dtype, k_dtype = q.dtype, k.dtype
            q = self.q_norm(q).to(q_dtype)
            k = self.k_norm(k).to(k_dtype)
            if self.differential:
                q_diff = self.q_norm(q_diff).to(q_dtype)
                k_diff = self.k_norm(k_diff).to(k_dtype)

        if rotary_pos_emb is not None:
            freqs, _ = rotary_pos_emb
            q_dtype, k_dtype = q.dtype, k.dtype
            # Rotation is done in float32 and cast back afterwards.
            q = _apply_rotary_pos_emb(q.float(), freqs).to(q_dtype)
            k = _apply_rotary_pos_emb(k.float(), freqs).to(k_dtype)
            if self.differential:
                q_diff = _apply_rotary_pos_emb(q_diff.float(), freqs).to(q_dtype)
                k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype)

        out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True)
        if self.differential:
            out = out - optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True)
        del q, k, v, q_diff, k_diff

        return self.to_out(out)
|
||||
|
||||
|
||||
class _Sin(nn.Module):
|
||||
def forward(self, x):
|
||||
return torch.sin(3.14159265359 * x)
|
||||
|
||||
|
||||
class _GLU(nn.Module):
|
||||
def __init__(self, dim_in, dim_out, activation, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.act = activation
|
||||
self.proj = operations.Linear(dim_in, dim_out * 2, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.proj(x)
|
||||
x, gate = x.chunk(2, dim=-1)
|
||||
return x * self.act(gate)
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
    """Gated feed-forward block: GLU up-projection followed by a linear down-projection.

    Args:
        dim: Input and output width.
        mult: Hidden width multiplier (``inner_dim = int(dim * mult)``).
        no_bias: Disable the bias of the down-projection.
        zero_init_output: Accepted for signature compatibility; unused here.
        sinusoidal: Use the sine activation for the gate instead of SiLU.
        dtype, device, operations: Forwarded to the created layers.
    """

    def __init__(self, dim, mult=4, no_bias=False, zero_init_output=True,
                 sinusoidal=False, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        hidden = int(dim * mult)
        if sinusoidal:
            gate_act = _Sin()
        else:
            gate_act = nn.SiLU()

        up = _GLU(dim, hidden, gate_act, dtype=dtype, device=device, operations=operations)
        down = operations.Linear(hidden, dim, bias=not no_bias, dtype=dtype, device=device)
        # The Identity entries keep the parameterised layers at indices 0 and 2
        # of the Sequential, which fixes their state-dict key names.
        self.ff = nn.Sequential(up, nn.Identity(), down, nn.Identity())

    def forward(self, x, **kwargs):
        return self.ff(x)
|
||||
|
||||
|
||||
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: self-attention and feed-forward, each with a residual.

    Args:
        dim: Model width.
        dim_heads: Requested head width; clamped to at most ``dim``.
        causal: Accepted for signature compatibility; unused here.
        zero_init_branch_outputs: Passed to both sub-blocks as ``zero_init_output``.
        norm_type: ``"dyt"`` selects DynamicTanh; anything else uses
            ``operations.RMSNorm``.
        add_rope: Create a rotary embedding of width ``dim_heads // 2``.
        attn_kwargs, ff_kwargs, norm_kwargs: Extra keyword arguments for the
            attention, feed-forward and norm constructors.
        dtype, device, operations: Forwarded to the created layers.
    """

    def __init__(self, dim, dim_heads=64, causal=False, zero_init_branch_outputs=True,
                 norm_type="dyt", add_rope=False, attn_kwargs=None, ff_kwargs=None,
                 norm_kwargs=None, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        attn_kwargs = {} if attn_kwargs is None else attn_kwargs
        ff_kwargs = {} if ff_kwargs is None else ff_kwargs
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
        dim_heads = min(dim_heads, dim)

        if norm_type == "dyt":
            norm_cls = DynamicTanh
        else:
            norm_cls = operations.RMSNorm
        # dtype/device always win over anything supplied in norm_kwargs.
        norm_args = dict(norm_kwargs)
        norm_args.update(dtype=dtype, device=device)

        self.pre_norm = norm_cls(dim, **norm_args)
        self.self_attn = Attention(
            dim,
            dim_heads=dim_heads,
            zero_init_output=zero_init_branch_outputs,
            dtype=dtype,
            device=device,
            operations=operations,
            **attn_kwargs,
        )
        self.ff_norm = norm_cls(dim, **norm_args)
        self.ff = FeedForward(
            dim,
            zero_init_output=zero_init_branch_outputs,
            dtype=dtype,
            device=device,
            operations=operations,
            **ff_kwargs,
        )
        if add_rope:
            self.rope = RotaryEmbedding(dim_heads // 2, dtype=dtype, device=device)
        else:
            self.rope = None

    def forward(self, x, mask=None, **kwargs):
        """Apply the layer to ``x``; ``mask`` is forwarded to self-attention."""
        if self.rope is None:
            rope = None
        else:
            # Positions are derived from the sequence axis (second to last).
            rope = self.rope.forward_from_seq_len(x.shape[-2], device=x.device)

        attended = self.self_attn(self.pre_norm(x), rotary_pos_emb=rope, mask=mask)
        x = x + attended
        x = x + self.ff(self.ff_norm(x))
        return x
|
||||
|
||||
|
||||
class TransformerResamplingBlock(nn.Module):
    """Transformer stage that resamples a ``(B, C, T)`` sequence by ``stride``.

    Learned tokens are appended after every input segment, the transformer
    stack runs over the interleaved sequence, and only the appended tokens are
    kept. An encoder maps ``stride`` inputs to 1 output; a decoder maps 1
    input to ``stride`` outputs. The channel mapping is applied before the
    transformers for an encoder and after them for a decoder.

    Attention is restricted either by processing fixed-size chunks
    (``sliding_window is None``) or by a sliding-window mask.

    Raises:
        ValueError: If ``type`` is neither ``"encoder"`` nor ``"decoder"``.
    """

    def __init__(self, in_channels, out_channels, stride, type="encoder",
                 transformer_depth=3, dim_heads=128, differential=True,
                 sliding_window=None, chunk_size=128, chunk_midpoint_shift=False,
                 dyt=True, ff_mult=3, mapping_bias=True, variable_stride=False,
                 sinusoidal_blocks=0, conv_mapping=False, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        if type not in ("encoder", "decoder"):
            raise ValueError(f"type must be 'encoder' or 'decoder', got {type!r}")

        self.type = type
        self.stride = stride
        self.chunk_size = chunk_size
        self.chunk_midpoint_shift = chunk_midpoint_shift
        self.variable_stride = variable_stride
        self.transformer_depth = transformer_depth

        # The transformers run at the wider side of the channel mapping.
        if type == "encoder":
            transformer_dim = out_channels
        else:
            transformer_dim = in_channels

        if in_channels != out_channels:
            kernel_size = 3 if conv_mapping else 1
            self.mapping = WNConv1d(in_channels, out_channels, kernel_size,
                                    padding="same", bias=mapping_bias)
        else:
            self.mapping = nn.Identity()

        self.sliding_window_latents = sliding_window
        self.sliding_window_seq = self._get_sliding_window_size(sliding_window, stride)
        self.input_seg_size, self.output_seg_size, self.sub_chunk_size = self._get_seg_sizes(stride)

        # With variable stride a single token is stored and expanded at run time.
        token_seq = 1 if variable_stride else self.output_seg_size
        self.new_tokens = nn.Parameter(
            torch.empty(1, token_seq, transformer_dim, dtype=dtype, device=device))

        norm_type = "dyt" if dyt else "rms_norm"
        attn_kwargs = {
            "qk_norm": "dyt" if dyt else "rms",
            "qk_norm_eps": 1e-3,
            "differential": differential,
        }
        norm_kwargs = {"eps": 1e-3}

        blocks = [
            TransformerBlock(
                transformer_dim,
                dim_heads=dim_heads,
                causal=False,
                zero_init_branch_outputs=True,
                norm_type=norm_type,
                add_rope=True,
                attn_kwargs=attn_kwargs,
                ff_kwargs={
                    "mult": ff_mult,
                    "no_bias": False,
                    # True for blocks close enough to the end of the stack.
                    "sinusoidal": (transformer_depth - index) < sinusoidal_blocks,
                },
                norm_kwargs=norm_kwargs,
                dtype=dtype, device=device, operations=operations,
            )
            for index in range(transformer_depth)
        ]
        self.transformers = nn.ModuleList(blocks)

    def _get_sliding_window_size(self, window, stride, prepend_cond_length=0):
        """Convert a window given in latents to interleaved-sequence positions."""
        if window is None:
            return None
        per_latent = stride + 1 + prepend_cond_length
        return [w * per_latent for w in window]

    def _get_seg_sizes(self, stride, prepend_cond_length=0):
        """Return ``(input_seg_size, output_seg_size, sub_chunk_size)`` for ``stride``."""
        sub_chunk_size = stride + 1 + prepend_cond_length
        if self.type == "encoder":
            input_seg_size, output_seg_size = stride, 1
        else:
            input_seg_size, output_seg_size = 1, stride
        return input_seg_size, output_seg_size, sub_chunk_size

    def forward(self, x, stride=None, **kwargs):
        """Resample ``x`` of shape ``(B, C, T)``.

        ``stride`` overrides the stride given at construction for this call.
        """
        B = x.shape[0]

        if stride is None:
            input_seg = self.input_seg_size
            output_seg = self.output_seg_size
            sub_chunk = self.sub_chunk_size
            sliding_window = self.sliding_window_seq
            active_stride = self.stride
        else:
            input_seg, output_seg, sub_chunk = self._get_seg_sizes(stride)
            sliding_window = self._get_sliding_window_size(self.sliding_window_latents, stride)
            active_stride = stride

        is_encoder = self.type == "encoder"
        has_transformers = self.transformer_depth > 0

        if is_encoder:
            if has_transformers:
                # Pad the time axis so it splits evenly into chunks / segments.
                pad_mod = self.chunk_size if sliding_window is None else input_seg
                x = _zero_pad_modulo_sequence(x, pad_mod, dim=-1)
            x = self.mapping(x)

        if has_transformers:
            # (B, C, T) -> (B, T, C)
            x = x.permute(0, 2, 1)

            if not is_encoder:
                if sliding_window is not None:
                    pad_mod = 1
                else:
                    pad_mod = self.chunk_size // active_stride
                x = _zero_pad_modulo_sequence(x, pad_mod)

            C = x.shape[2]

            # Append the learned tokens after every input segment.
            x = x.reshape(-1, input_seg, C)
            new_tokens = self.new_tokens.expand(x.shape[0], output_seg, -1)
            x = torch.cat([x, comfy.ops.cast_to_input(new_tokens, x)], dim=-2)
            del new_tokens
            x = x.reshape(B, -1, C)

            if sliding_window is not None:
                attn_mask = _sliding_window_mask(x.shape[1], sliding_window[0], x.device, x.dtype)
                for layer in self.transformers:
                    x = layer(x, mask=attn_mask)
            else:
                # Chunk length measured in the interleaved sequence.
                eff_chunk = self.chunk_size + self.chunk_size // active_stride

                if self.chunk_midpoint_shift:
                    split = self.transformer_depth // 2
                    shift = eff_chunk // 2

                    x = x.reshape(-1, eff_chunk, C)
                    for layer in self.transformers[:split]:
                        x = layer(x)
                    x = x.reshape(B, -1, C)

                    # Extend both ends so the second half of the stack sees
                    # chunks offset by half a chunk from the first half.
                    shifted = torch.cat([x[:, :shift, :], x, x[:, -shift:, :]], dim=1)
                    del x
                    x = shifted.reshape(-1, eff_chunk, C)
                    del shifted
                    for layer in self.transformers[split:]:
                        x = layer(x)
                    x = x.reshape(B, -1, C)
                    x = x[:, shift:-shift, :]
                else:
                    x = x.reshape(-1, eff_chunk, C)
                    for layer in self.transformers:
                        x = layer(x)
                    x = x.reshape(B, -1, C)

            # Keep only the appended tokens of every sub-chunk.
            x = x.reshape(-1, sub_chunk, C)
            x = x[:, -output_seg:, :]
            x = x.reshape(B, -1, C).transpose(1, 2)

        if self.type == "decoder":
            x = self.mapping(x)

        return x
|
||||
|
||||
|
||||
class SAMEEncoder(nn.Module):
    """Stack of encoder resampling blocks followed by a linear projection to the latent width.

    One ``TransformerResamplingBlock`` is built per entry of ``c_mults``; stage
    ``i`` uses ``channels * c_mults[i]`` output channels, ``strides[i]`` and
    ``transformer_depths[i]``. Extra keyword arguments are passed to every block.
    """

    def __init__(self, in_channels=2, channels=128, latent_dim=32,
                 c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
                 transformer_depths=(3, 3, 3, 3),
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        widths = [in_channels, *(channels * mult for mult in c_mults)]

        stages = []
        for index, (width_in, width_out) in enumerate(zip(widths[:-1], widths[1:])):
            stage = TransformerResamplingBlock(
                in_channels=width_in,
                out_channels=width_out,
                stride=strides[index],
                type="encoder",
                transformer_depth=transformer_depths[index],
                dtype=dtype, device=device, operations=operations,
                **kwargs,
            )
            stages.append(stage)

        # The linear layer acts on the last axis, so channels are moved there
        # and back around it.
        stages.extend([
            Transpose(),
            operations.Linear(widths[-1], latent_dim, dtype=dtype, device=device),
            Transpose(),
        ])
        self.layers = nn.ModuleList(stages)

    def forward(self, x, **kwargs):
        """Run ``x`` through every layer in order; keyword arguments are ignored."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out
|
||||
|
||||
|
||||
class SAMEDecoder(nn.Module):
    """Linear projection from the latent width followed by decoder resampling blocks.

    Mirrors the encoder layout: stages are created from the last entry of
    ``c_mults`` down to the first. ``sinusoidal_blocks`` gives, per stage, the
    value passed to that stage's ``sinusoidal_blocks`` argument (all zeros
    when omitted). Extra keyword arguments are passed to every block.
    """

    def __init__(self, out_channels=2, channels=128, latent_dim=32,
                 c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
                 transformer_depths=(3, 3, 3, 3), sinusoidal_blocks=None,
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        n_stages = len(c_mults)
        if sinusoidal_blocks is None:
            sinusoidal_blocks = [0] * n_stages
        widths = [out_channels, *(channels * mult for mult in c_mults)]

        # The linear layer acts on the last axis, so channels are moved there
        # and back around it.
        stages = [
            Transpose(),
            operations.Linear(latent_dim, widths[-1], dtype=dtype, device=device),
            Transpose(),
        ]
        for index in reversed(range(n_stages)):
            stages.append(TransformerResamplingBlock(
                in_channels=widths[index + 1],
                out_channels=widths[index],
                stride=strides[index],
                type="decoder",
                transformer_depth=transformer_depths[index],
                sinusoidal_blocks=sinusoidal_blocks[index],
                dtype=dtype, device=device, operations=operations,
                **kwargs,
            ))
        self.layers = nn.ModuleList(stages)

    def forward(self, x, **kwargs):
        """Run ``x`` through every layer in order; keyword arguments are ignored."""
        out = x
        for layer in self.layers:
            out = layer(out)
        return out
|
||||
|
||||
|
||||
class SoftNormBottleneck(nn.Module):
    """Affine latent bottleneck with optional running-std scaling and noise.

    Args:
        dim: Number of latent channels.
        noise_augment_dim: Number of extra noise channels appended on decode.
        noise_regularize: Add small Gaussian noise to the latents on decode.
        auto_scale: Register a ``running_std`` parameter used to divide on
            encode and multiply on decode.
        freeze: Disable gradients for every parameter.
        dtype, device: Used for parameter allocation.

    All parameters are allocated uninitialised (``torch.empty``), so they are
    presumably expected to come from a checkpoint.
    """

    def __init__(self, dim=32, noise_augment_dim=0, noise_regularize=False,
                 auto_scale=False, freeze=False, dtype=None, device=None, **kwargs):
        super().__init__()
        factory = {"dtype": dtype, "device": device}
        self.noise_augment_dim = noise_augment_dim
        self.noise_regularize = noise_regularize
        self.scaling_factor = nn.Parameter(torch.empty(1, dim, 1, **factory))
        self.bias = nn.Parameter(torch.empty(1, dim, 1, **factory))
        self.noise_scaling_factor = nn.Parameter(torch.empty(1, noise_augment_dim, 1, **factory))
        if auto_scale:
            self.running_std = nn.Parameter(torch.empty(1, **factory), requires_grad=False)
        if freeze:
            for param in self.parameters():
                param.requires_grad_(False)

    def encode(self, x, return_info=False, **kwargs):
        """Apply the affine map (and running-std division when present).

        Returns ``x`` or, with ``return_info``, the tuple ``(x, {})``.
        """
        scale = comfy.ops.cast_to_input(self.scaling_factor, x)
        shift = comfy.ops.cast_to_input(self.bias, x)
        x = x * scale + shift
        if hasattr(self, "running_std"):
            x = x / comfy.ops.cast_to_input(self.running_std, x)
        return (x, {}) if return_info else x

    def decode(self, x, **kwargs):
        """Undo the running-std scaling and apply the configured noise steps.

        The affine map from ``encode`` is not inverted here.
        """
        has_running_std = hasattr(self, "running_std")
        if has_running_std:
            x = x * comfy.ops.cast_to_input(self.running_std, x)

        if self.noise_regularize:
            if has_running_std:
                scaling = self.running_std
            else:
                scaling = x.std(dim=-1, keepdim=True)
            # Noise amplitude is 1e-3 of the chosen scale.
            noise = torch.randn_like(x) * comfy.ops.cast_to_input(scaling, x) * 1e-3
            x = x + noise

        if self.noise_augment_dim > 0:
            extra = torch.randn(
                x.shape[0], self.noise_augment_dim, x.shape[-1], device=x.device, dtype=x.dtype)
            noise = comfy.ops.cast_to_input(self.noise_scaling_factor, x) * extra
            x = torch.cat([x, noise], dim=1)

        return x
|
||||
|
||||
|
||||
class PatchedPretransform(nn.Module):
    """Parameter-free patching of a ``(B, C, T)`` signal into ``(B, C * patch_size, T / patch_size)``.

    Args:
        channels: Stored on the instance; not used by the transforms themselves.
        patch_size: Number of consecutive samples folded into the channel axis.
    """

    def __init__(self, channels, patch_size, **kwargs):
        super().__init__()
        self.channels = channels
        self.patch_size = patch_size
        self.enable_grad = False

    def _pad(self, x):
        """Zero-pad the last axis of ``x`` up to a multiple of ``patch_size``."""
        pad_len = (-x.shape[-1]) % self.patch_size
        if pad_len > 0:
            # Allocate the padding explicitly. Slicing the input to size it
            # (zeros_like(x[..., :pad_len])) yields too few zeros whenever the
            # input is shorter than the padding, i.e. for T < patch_size / 2.
            padding = x.new_zeros(*x.shape[:-1], pad_len)
            x = torch.cat([x, padding], dim=-1)
        return x

    def encode(self, x):
        """Fold patches into channels: ``b c (l h) -> b (c h) l`` after zero-padding."""
        x = self._pad(x)
        B, C, T = x.shape
        h = self.patch_size
        L = T // h
        # b c (l h) -> b (c h) l
        return x.reshape(B, C, L, h).permute(0, 1, 3, 2).reshape(B, C * h, L)

    def decode(self, x):
        """Unfold channels back into time: ``b (c h) l -> b c (l h)``.

        Padding added by ``encode`` is not removed.
        """
        B, Ch, L = x.shape
        h = self.patch_size
        C = Ch // h
        # b (c h) l -> b c (l h)
        return x.reshape(B, C, h, L).permute(0, 1, 3, 2).reshape(B, C, L * h)
|
||||
|
||||
|
||||
class SA3AudioVAE(nn.Module):
    """SA3 VAE. State dict keys match checkpoint after stripping 'pretransform.model.'

    Pipeline: a fixed patching pretransform (2 channels, patch size 256, hence
    512 features), a single-stage transformer encoder/decoder with stride 16
    and 256 latent channels, and a frozen soft-norm bottleneck.
    """

    def __init__(self, channels=256, transformer_depths=12, sinusoidal_blocks=8,
                 sliding_window=None, decoder_conv_mapping=False,
                 chunk_size=128, chunk_midpoint_shift=False,
                 dtype=None, device=None, operations=None):
        super().__init__()
        if operations is None:
            operations = ops

        self.pretransform = PatchedPretransform(channels=2, patch_size=256)

        # Settings shared by the encoder and the decoder stacks.
        shared = {
            "differential": True,
            "dyt": True,
            "dim_heads": 64,
            "sliding_window": sliding_window,
            "variable_stride": True,
            "chunk_size": chunk_size,
            "chunk_midpoint_shift": chunk_midpoint_shift,
            "dtype": dtype,
            "device": device,
            "operations": operations,
        }
        depths = [transformer_depths]

        self.encoder = SAMEEncoder(
            in_channels=512,
            channels=channels,
            c_mults=[6],
            strides=[16],
            latent_dim=256,
            transformer_depths=depths,
            conv_mapping=False,
            **shared,
        )
        self.decoder = SAMEDecoder(
            out_channels=512,
            channels=channels,
            c_mults=[6],
            strides=[16],
            latent_dim=256,
            transformer_depths=[transformer_depths],
            sinusoidal_blocks=[sinusoidal_blocks],
            conv_mapping=decoder_conv_mapping,
            **shared,
        )
        self.bottleneck = SoftNormBottleneck(
            dim=256,
            noise_augment_dim=0,
            noise_regularize=True,
            auto_scale=True,
            freeze=True,
            dtype=dtype,
            device=device,
        )

    @torch.no_grad()
    def _pretransform_encode(self, x):
        """Patch the input without tracking gradients."""
        return self.pretransform.encode(x)

    @torch.no_grad()
    def _pretransform_decode(self, x):
        """Un-patch the decoder output without tracking gradients."""
        return self.pretransform.decode(x)

    def encode(self, x):
        """Encode by running pretransform, encoder and bottleneck in that order."""
        patched = self._pretransform_encode(x)
        hidden = self.encoder(patched)
        return self.bottleneck.encode(hidden)

    def decode(self, x):
        """Decode by running bottleneck, decoder and inverse pretransform in that order."""
        latents = self.bottleneck.decode(x)
        patched = self.decoder(latents)
        return self._pretransform_decode(patched)
|
||||
@ -328,7 +328,7 @@ class CrossAttention(nn.Module):
|
||||
kv = torch.cat((k, v), dim=-1)
|
||||
split_size = kv.shape[-1] // self.num_heads // 2
|
||||
|
||||
kv = kv.view(1, -1, self.num_heads, split_size * 2)
|
||||
kv = kv.view(b, -1, self.num_heads, split_size * 2)
|
||||
k, v = torch.split(kv, split_size, dim=-1)
|
||||
|
||||
q = q.view(b, s1, self.num_heads, self.head_dim)
|
||||
@ -398,7 +398,7 @@ class Attention(nn.Module):
|
||||
qkv_combined = torch.cat((query, key, value), dim=-1)
|
||||
split_size = qkv_combined.shape[-1] // self.num_heads // 3
|
||||
|
||||
qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3)
|
||||
qkv = qkv_combined.view(B, -1, self.num_heads, split_size * 3)
|
||||
query, key, value = torch.split(qkv, split_size, dim=-1)
|
||||
|
||||
query = query.reshape(B, N, self.num_heads, self.head_dim)
|
||||
@ -607,9 +607,9 @@ class HunYuanDiTPlain(nn.Module):
|
||||
def forward(self, x, t, context, transformer_options = {}, **kwargs):
|
||||
|
||||
x = x.movedim(-1, -2)
|
||||
uncond_emb, cond_emb = context.chunk(2, dim = 0)
|
||||
|
||||
context = torch.cat([cond_emb, uncond_emb], dim = 0)
|
||||
if context.shape[0] >= 2:
|
||||
uncond_emb, cond_emb = context.chunk(2, dim = 0)
|
||||
context = torch.cat([cond_emb, uncond_emb], dim = 0)
|
||||
main_condition = context
|
||||
|
||||
t = 1.0 - t
|
||||
@ -657,5 +657,8 @@ class HunYuanDiTPlain(nn.Module):
|
||||
output = self.final_layer(combined)
|
||||
output = output.movedim(-2, -1) * (-1.0)
|
||||
|
||||
cond_emb, uncond_emb = output.chunk(2, dim = 0)
|
||||
return torch.cat([uncond_emb, cond_emb])
|
||||
if output.shape[0] >= 2:
|
||||
cond_emb, uncond_emb = output.chunk(2, dim = 0)
|
||||
return torch.cat([uncond_emb, cond_emb])
|
||||
else:
|
||||
return output
|
||||
|
||||
@ -813,6 +813,85 @@ class StableAudio1(BaseModel):
|
||||
sd["{}{}".format(k, l)] = s[l]
|
||||
return sd
|
||||
|
||||
class StableAudio3(BaseModel):
    """Model wrapper around ``AudioDiffusionTransformer`` with a duration embedder.

    Args:
        model_config: Model configuration; ``unet_config["timestep_features_type"]``
            selects the Fourier feature type of the duration embedder.
        seconds_total_embedder_weights: State dict loaded into the duration embedder.
        padding_embedding: Optional tensor used to pad the text conditioning;
            stored as a non-trainable parameter.
        model_type: Defaults to ``ModelType.FLOW``.
        device: Forwarded to the base class.
    """

    def __init__(self, model_config, seconds_total_embedder_weights, padding_embedding=None, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.audio.dit.AudioDiffusionTransformer)
        features_type = model_config.unet_config["timestep_features_type"]
        self.seconds_total_embedder = comfy.ldm.audio.embedders.NumberConditioner(
            768, min_val=0, max_val=384, fourier_features_type=features_type)
        self.seconds_total_embedder.load_state_dict(seconds_total_embedder_weights)
        if padding_embedding is None:
            self.padding_embedding = None
        else:
            self.padding_embedding = torch.nn.Parameter(padding_embedding, requires_grad=False)

    def concat_cond(self, **kwargs):
        """Build the ``[mask, latent]`` conditioning tensor, concatenated on dim 1.

        Without ``concat_latent_image`` the latent part is all zeros shaped like
        the noise. The mask comes from ``concat_mask`` (or ``denoise_mask``),
        is reduced to one channel and inverted; without a mask it is all zeros.
        """
        noise = kwargs.get("noise", None)
        image = kwargs.get("concat_latent_image", None)

        if image is not None:
            image = self.process_latent_in(image)
            # TODO: scale if not match
            image = utils.resize_to_batch_size(image, noise.shape[0])
        else:
            image = torch.zeros(list(noise.shape), dtype=noise.dtype, layout=noise.layout, device=noise.device)

        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
        if mask is not None:
            if mask.shape[1] != 1:
                mask = torch.mean(mask, dim=1, keepdim=True)
            mask = 1.0 - mask
            # TODO: scale if not match
            mask = utils.resize_to_batch_size(mask, noise.shape[0])
        else:
            mask = torch.zeros_like(noise)[:, :1]

        return torch.cat((mask, image), dim=1)

    def extra_conds(self, **kwargs):
        """Assemble the conditioning dictionary passed to the diffusion model.

        Produces ``local_add_cond``, ``global_embed`` (the duration embedding)
        and, when text conditioning is present, ``c_crossattn`` (the text
        tokens, optionally padded, followed by the duration embedding).
        """
        out = {}

        concat_cond = self.concat_cond(**kwargs)
        if concat_cond is not None:
            out['local_add_cond'] = comfy.conds.CONDNoiseShape(concat_cond)

        noise = kwargs.get("noise", None)
        device = kwargs["device"]

        # Fallback duration derived from the latent length; 10.7666 is
        # presumably the number of latent frames per second (confirm).
        fallback_seconds = int(noise.shape[-1] / 10.7666)
        seconds_total = kwargs.get("seconds_total", fallback_seconds)
        seconds_total_embed = self.seconds_total_embedder([seconds_total])[0].to(device)

        out['global_embed'] = comfy.conds.CONDRegular(seconds_total_embed.reshape((1, -1)))

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            cross_attn = cross_attn.to(device)
            if self.padding_embedding is not None:
                pe = self.padding_embedding.to(device=device, dtype=cross_attn.dtype)
                max_text_tokens = self.model_config.unet_config.get("max_text_tokens", 256)
                missing = max_text_tokens - cross_attn.shape[1]
                if missing > 0:
                    # Fill the text sequence up to max_text_tokens.
                    pad = pe.view(1, 1, -1).expand(cross_attn.shape[0], missing, -1)
                    cross_attn = torch.cat([cross_attn, pad], dim=1)
            duration_tokens = seconds_total_embed.repeat((cross_attn.shape[0], 1, 1))
            cross_attn = torch.cat([cross_attn, duration_tokens], dim=1)
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)

        return out

    def state_dict_for_saving(self, unet_state_dict, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
        """Extend the base state dict with the duration embedder and padding embedding."""
        sd = super().state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)

        prefixed = {"conditioner.conditioners.seconds_total.": self.seconds_total_embedder.state_dict()}
        for prefix, weights in prefixed.items():
            for key, value in weights.items():
                sd["{}{}".format(prefix, key)] = value

        if self.padding_embedding is not None:
            sd["conditioner.conditioners.prompt.padding_embedding"] = self.padding_embedding.data
        return sd
|
||||
|
||||
|
||||
class HunyuanDiT(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
|
||||
|
||||
@ -116,6 +116,45 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
if '{}transformer.rotary_pos_emb.inv_freq'.format(key_prefix) in state_dict_keys: #stable audio dit
|
||||
unet_config = {}
|
||||
unet_config["audio_model"] = "dit1.0"
|
||||
unet_config["global_cond_dim"] = state_dict['{}to_global_embed.0.weight'.format(key_prefix)].shape[1]
|
||||
cond_embed = state_dict['{}to_cond_embed.0.weight'.format(key_prefix)]
|
||||
unet_config["project_cond_tokens"] = cond_embed.shape[0] != cond_embed.shape[1]
|
||||
unet_config["embed_dim"] = state_dict['{}to_timestep_embed.0.weight'.format(key_prefix)].shape[0]
|
||||
mem_tokens = state_dict.get('{}transformer.memory_tokens'.format(key_prefix), None)
|
||||
to_qkv = state_dict.get('{}transformer.layers.0.self_attn.to_qkv.weight'.format(key_prefix), None)
|
||||
differential = False
|
||||
if to_qkv is not None:
|
||||
if to_qkv.shape[0] == to_qkv.shape[1] * 5:
|
||||
differential = True
|
||||
if mem_tokens is not None:
|
||||
unet_config["num_memory_tokens"] = mem_tokens.shape[0]
|
||||
if '{}transformer.layers.0.self_attn.q_norm.weight'.format(key_prefix) in state_dict:
|
||||
unet_config["attn_kwargs"] = {"qk_norm": "ln", "feat_scale": True}
|
||||
rms_norm = state_dict.get('{}transformer.layers.0.self_attn.q_norm.gamma'.format(key_prefix), None)
|
||||
if rms_norm is not None:
|
||||
unet_config["attn_kwargs"] = {"qk_norm": "rms", "differential": differential}
|
||||
unet_config["norm_type"] = "rms_norm"
|
||||
unet_config["num_heads"] = unet_config["embed_dim"] // rms_norm.shape[0]
|
||||
|
||||
if '{}timestep_features.weight'.format(key_prefix) in state_dict:
|
||||
unet_config["timestep_features_type"] = "learned"
|
||||
else:
|
||||
unet_config["timestep_features_type"] = "expo"
|
||||
|
||||
io_channels = state_dict['{}postprocess_conv.weight'.format(key_prefix)].shape[0]
|
||||
unet_config["io_channels"] = io_channels
|
||||
unet_config["input_concat_dim"] = state_dict['{}transformer.project_in.weight'.format(key_prefix)].shape[1] - io_channels
|
||||
|
||||
local_add_cond = state_dict.get('{}transformer.layers.0.to_local_embed.0.weight'.format(key_prefix), None)
|
||||
if local_add_cond is not None:
|
||||
unet_config["local_add_cond_dim"] = local_add_cond.shape[1]
|
||||
|
||||
global_cond_embed = state_dict.get('{}transformer.global_cond_embedder.0.weight'.format(key_prefix), None)
|
||||
if global_cond_embed is not None:
|
||||
unet_config["global_cond_shared_embed"] = True
|
||||
unet_config["global_cond_type"] = "adaLN"
|
||||
|
||||
unet_config["depth"] = count_blocks(state_dict_keys, '{}transformer.layers.'.format(key_prefix) + '{}.')
|
||||
return unet_config
|
||||
|
||||
if '{}double_layers.0.attn.w1q.weight'.format(key_prefix) in state_dict_keys: #aura flow dit
|
||||
|
||||
@ -260,7 +260,7 @@ def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, w
|
||||
|
||||
|
||||
def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False):
|
||||
# NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass
|
||||
# NOTE: offloadable=False is a legacy mode and if you are a custom node author reading this please pass
|
||||
# offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This
|
||||
# will add async-offload support to your cast and improve performance.
|
||||
if input is not None:
|
||||
|
||||
37
comfy/sd.py
37
comfy/sd.py
@ -21,6 +21,7 @@ import comfy.ldm.ace.vae.music_dcae_pipeline
|
||||
import comfy.ldm.cogvideo.vae
|
||||
import comfy.ldm.hunyuan_video.vae
|
||||
import comfy.ldm.mmaudio.vae.autoencoder
|
||||
import comfy.ldm.audio.vae_sa3
|
||||
import comfy.pixel_space_convert
|
||||
import comfy.weight_adapter
|
||||
import yaml
|
||||
@ -67,6 +68,7 @@ import comfy.text_encoders.qwen35
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.gemma4
|
||||
import comfy.text_encoders.cogvideo
|
||||
import comfy.text_encoders.sa3
|
||||
|
||||
import comfy.model_patcher
|
||||
import comfy.lora
|
||||
@ -854,6 +856,34 @@ class VAE:
|
||||
self.working_dtypes = [torch.float32]
|
||||
self.disable_offload = True
|
||||
self.extra_1d_channel = 16
|
||||
elif "decoder.layers.3.transformers.0.pre_norm.alpha" in sd: # Stable Audio 3 VAE
|
||||
if "decoder.layers.3.transformers.11.self_attn.to_out.weight" in sd:
|
||||
config = {"channels": 256, "transformer_depths": 12, "sinusoidal_blocks": 8,
|
||||
"sliding_window": [1, 1], "decoder_conv_mapping": False,
|
||||
"chunk_size": 128, "chunk_midpoint_shift": False}
|
||||
self.memory_used_encode = lambda shape, dtype: (1500 * shape[2]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (1500 * shape[2] * 4096) * model_management.dtype_size(dtype)
|
||||
else:
|
||||
config = {"channels": 128, "transformer_depths": 6, "sinusoidal_blocks": 0,
|
||||
"sliding_window": None, "decoder_conv_mapping": True,
|
||||
"chunk_size": 32, "chunk_midpoint_shift": True}
|
||||
self.memory_used_encode = lambda shape, dtype: (72 * shape[2]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (72 * shape[2] * 4096) * model_management.dtype_size(dtype)
|
||||
|
||||
self.first_stage_model = comfy.ldm.audio.vae_sa3.SA3AudioVAE(**config)
|
||||
self.latent_channels = 256
|
||||
self.output_channels = 2
|
||||
self.upscale_ratio = 4096
|
||||
self.downscale_ratio = 4096
|
||||
self.latent_dim = 1
|
||||
self.audio_sample_rate = 44100
|
||||
self.process_output = lambda audio: audio
|
||||
self.process_input = lambda audio: audio
|
||||
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
#This VAE has Parameters and Buffers the non-dynamic caster cannot handle
|
||||
#Force cast it for --disable-dynamic-vram users until there is a true core fix.
|
||||
if not comfy.memory_management.aimdo_enabled:
|
||||
self.disable_offload = True
|
||||
else:
|
||||
logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
|
||||
self.first_stage_model = None
|
||||
@ -1290,6 +1320,7 @@ class TEModel(Enum):
|
||||
GEMMA_4_E4B = 29
|
||||
GEMMA_4_E2B = 30
|
||||
GEMMA_4_31B = 31
|
||||
T5_GEMMA = 32
|
||||
|
||||
|
||||
def detect_te_model(sd):
|
||||
@ -1314,6 +1345,8 @@ def detect_te_model(sd):
|
||||
if weight.shape[0] == 384:
|
||||
return TEModel.BYT5_SMALL_GLYPH
|
||||
return TEModel.T5_BASE
|
||||
if "model.encoder.layers.0.pre_self_attn_layernorm.weight" in sd:
|
||||
return TEModel.T5_GEMMA
|
||||
if 'model.layers.0.post_feedforward_layernorm.weight' in sd:
|
||||
if 'model.layers.59.self_attn.q_norm.weight' in sd:
|
||||
return TEModel.GEMMA_4_31B
|
||||
@ -1463,6 +1496,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
else:
|
||||
clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
|
||||
clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
|
||||
elif te_model == TEModel.T5_GEMMA:
|
||||
clip_target.clip = comfy.text_encoders.sa3.SAT5GemmaModel
|
||||
clip_target.tokenizer = comfy.text_encoders.sa3.SAT5GemmaTokenizer
|
||||
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
|
||||
elif te_model in (TEModel.GEMMA_4_E4B, TEModel.GEMMA_4_E2B, TEModel.GEMMA_4_31B):
|
||||
variant = {TEModel.GEMMA_4_E4B: comfy.text_encoders.gemma4.Gemma4_E4B,
|
||||
TEModel.GEMMA_4_E2B: comfy.text_encoders.gemma4.Gemma4_E2B,
|
||||
|
||||
@ -7,6 +7,7 @@ from . import sdxl_clip
|
||||
import comfy.text_encoders.sd2_clip
|
||||
import comfy.text_encoders.sd3_clip
|
||||
import comfy.text_encoders.sa_t5
|
||||
import comfy.text_encoders.sa3
|
||||
import comfy.text_encoders.aura_t5
|
||||
import comfy.text_encoders.pixart_t5
|
||||
import comfy.text_encoders.hydit
|
||||
@ -603,6 +604,29 @@ class StableAudio(supported_models_base.BASE):
|
||||
def clip_target(self, state_dict={}):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.sa_t5.SAT5Tokenizer, comfy.text_encoders.sa_t5.SAT5Model)
|
||||
|
||||
class StableAudio3(StableAudio):
    """Supported-model entry for Stable Audio 3, layered on top of ``StableAudio``.

    Overrides the matching ``unet_config``, the sampling settings, the latent
    format, the memory factor, and the model / text-encoder factories.
    """

    # Config keys used to recognise this architecture.
    unet_config = dict(
        audio_model="dit1.0",
        global_cond_shared_embed=True,
    )

    sampling_settings = dict(
        multiplier=1.0,
        shift=2.0,
    )

    latent_format = latent_formats.StableAudio3

    memory_usage_factor = 7

    def get_model(self, state_dict, prefix="", device=None):
        """Build the ``model_base.StableAudio3`` wrapper from a checkpoint state dict.

        The ``seconds_total`` conditioner weights and the optional prompt padding
        embedding are pulled out of ``state_dict`` and handed to the model.
        """
        # Keep this call first: same order as the original implementation.
        embedder_weights = utils.state_dict_prefix_replace(
            state_dict,
            {"conditioner.conditioners.seconds_total.": ""},
            filter_keys=True,
        )
        pad_embedding = state_dict.get("conditioner.conditioners.prompt.padding_embedding", None)
        return model_base.StableAudio3(
            self,
            seconds_total_embedder_weights=embedder_weights,
            padding_embedding=pad_embedding,
            device=device,
        )

    def clip_target(self, state_dict={}):
        """Return the tokenizer / text-encoder pair (T5Gemma) used by this model."""
        sa3_te = comfy.text_encoders.sa3
        return supported_models_base.ClipTarget(sa3_te.SAT5GemmaTokenizer, sa3_te.SAT5GemmaModel)
|
||||
|
||||
class AuraFlow(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"cond_seq_dim": 2048,
|
||||
@ -2018,6 +2042,7 @@ models = [
|
||||
SV3D_u,
|
||||
SV3D_p,
|
||||
SD3,
|
||||
StableAudio3,
|
||||
StableAudio,
|
||||
AuraFlow,
|
||||
PixArtAlpha,
|
||||
|
||||
207
comfy/text_encoders/sa3.py
Normal file
207
comfy/text_encoders/sa3.py
Normal file
@ -0,0 +1,207 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from comfy import sd1_clip
|
||||
from comfy.text_encoders.llama import Attention as LlamaAttention, RMSNorm, MLP, precompute_freqs_cis, apply_rope, _make_scaled_embedding
|
||||
from comfy.text_encoders.spiece_tokenizer import SPieceTokenizer
|
||||
|
||||
|
||||
class T5GemmaEncoderConfig:
    """Hard-coded hyper-parameters for the T5Gemma encoder built in this module.

    The object is a plain attribute bag: it takes no arguments and every value
    is fixed. It is passed to ``T5GemmaAttention`` (and through it to
    ``LlamaAttention``), ``MLP`` and the ``RMSNorm`` constructors.
    """

    def __init__(self):
        # Vocabulary / width / depth.
        self.vocab_size = 256000
        self.hidden_size = 768
        self.intermediate_size = 2048
        self.num_hidden_layers = 12

        # Attention geometry: 12 heads x 64 dims == hidden_size, no KV grouping.
        self.num_attention_heads = 12
        self.num_key_value_heads = 12
        self.head_dim = 64

        # Normalisation. ``rms_norm_add`` used to be assigned twice in this
        # constructor (False, then True); only the final value ever took
        # effect, so the dead first assignment was removed.
        self.rms_norm_eps = 1e-6
        self.rms_norm_add = True

        # Rotary embedding base.
        self.rope_theta = 10000.0

        # Soft-capping and query scaling read by T5GemmaAttention.
        self.attn_logit_softcapping = 50.0
        self.query_pre_attn_scalar = 64

        # Sliding-window layers alternate with full-attention layers.
        self.sliding_window = 4096
        self.layer_types = ["sliding_attention", "full_attention"] * (self.num_hidden_layers // 2)

        self.mlp_activation = "gelu_pytorch_tanh"

        # presumably consumed by LlamaAttention.__init__ — verify against llama.py
        self.qkv_bias = False
        self.q_norm = None
        self.k_norm = None
|
||||
|
||||
|
||||
class T5GemmaAttention(LlamaAttention):
    """Self-attention with tanh logit soft-capping.

    The q/k/v/o projections and head bookkeeping come from ``LlamaAttention``;
    ``__init__`` is extended only to record the query scale and the soft-cap
    value. ``forward`` computes attention by hand because the logits go through
    ``tanh(logits / cap) * cap`` between the QK^T matmul and the softmax.
    """

    def __init__(self, config, device=None, dtype=None, ops=None):
        super().__init__(config, device=device, dtype=dtype, ops=ops)
        self.scale = config.query_pre_attn_scalar ** -0.5
        self.softcap = config.attn_logit_softcapping

    def forward(self, hidden_states, attention_mask=None, freqs_cis=None, **kwargs):
        """Return ``(attention_output, None)`` for ``hidden_states``.

        ``attention_mask``, when given, is added to the soft-capped logits
        before the softmax. Extra keyword arguments are accepted and ignored.
        """
        batch, seq_len, _ = hidden_states.shape

        def project(layer, heads):
            # (B, S, heads * head_dim) -> (B, heads, S, head_dim)
            return layer(hidden_states).view(batch, seq_len, heads, self.head_dim).transpose(1, 2)

        queries = project(self.q_proj, self.num_heads)
        keys = project(self.k_proj, self.num_kv_heads)
        values = project(self.v_proj, self.num_kv_heads)
        queries, keys = apply_rope(queries, keys, freqs_cis)

        # Expand K/V so every query head has a matching key/value head.
        kv_repeat = self.num_heads // self.num_kv_heads
        keys = keys.repeat_interleave(kv_repeat, dim=1)
        values = values.repeat_interleave(kv_repeat, dim=1)

        logits = torch.matmul(queries * self.scale, keys.transpose(-2, -1))
        logits = torch.tanh(logits / self.softcap) * self.softcap
        if attention_mask is not None:
            logits = logits + attention_mask

        # Softmax is evaluated in float32 and cast back to the query dtype.
        weights = torch.nn.functional.softmax(logits.float(), dim=-1).to(queries.dtype)
        context = torch.matmul(weights, values).transpose(1, 2).reshape(batch, seq_len, self.inner_size)
        return self.o_proj(context), None
|
||||
|
||||
|
||||
class T5GemmaBlock(nn.Module):
    """One encoder layer: attention and MLP, each wrapped in pre/post RMSNorm plus a residual."""

    def __init__(self, config, layer_type, device=None, dtype=None, ops=None):
        super().__init__()
        self.self_attn = T5GemmaAttention(config, device=device, dtype=dtype, ops=ops)
        self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
        # Attribute names match checkpoint keys: model.encoder.layers.X.<name>.weight
        for norm_name in (
            "pre_self_attn_layernorm",
            "post_self_attn_layernorm",
            "pre_feedforward_layernorm",
            "post_feedforward_layernorm",
        ):
            setattr(self, norm_name, RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype))
        self.is_sliding = (layer_type == "sliding_attention")
        self.sliding_window = config.sliding_window

    def _with_sliding_window(self, x, attention_mask):
        """Return ``attention_mask`` combined with the local-window mask when one applies.

        The window is only built for sliding layers whose sequence is longer
        than ``sliding_window``; otherwise the incoming mask is returned as is.
        """
        seq_len = x.shape[1]
        if not (self.is_sliding and seq_len > self.sliding_window):
            return attention_mask
        positions = torch.arange(seq_len, device=x.device)
        distance = (positions.unsqueeze(0) - positions.unsqueeze(1)).abs()
        window = torch.zeros(seq_len, seq_len, dtype=x.dtype, device=x.device)
        # NOTE(review): pairs exactly `sliding_window` apart stay visible here
        # (`>`); confirm against the reference T5Gemma mask, which may use `>=`.
        window.masked_fill_(distance > self.sliding_window, -torch.finfo(x.dtype).max)
        window = window.unsqueeze(0).unsqueeze(0)
        if attention_mask is None:
            return window
        return attention_mask + window

    def forward(self, x, attention_mask=None, freqs_cis=None):
        """Run the attention sub-block followed by the feed-forward sub-block."""
        layer_mask = self._with_sliding_window(x, attention_mask)

        # Attention: pre-norm -> attention -> post-norm -> residual add.
        skip = x
        attn_out, _ = self.self_attn(self.pre_self_attn_layernorm(x), attention_mask=layer_mask, freqs_cis=freqs_cis)
        x = skip + self.post_self_attn_layernorm(attn_out)

        # Feed-forward: pre-norm -> MLP -> post-norm -> residual add.
        skip = x
        mlp_out = self.mlp(self.pre_feedforward_layernorm(x))
        x = skip + self.post_feedforward_layernorm(mlp_out)
        return x
|
||||
|
||||
|
||||
class T5GemmaEncoder(nn.Module):
    """Encoder stack made of ``embed_tokens``, ``layers`` and a final ``norm``.

    Keys: embed_tokens.*, layers.X.*, norm.*
    """

    def __init__(self, config, device, dtype, ops):
        super().__init__()
        self.config = config
        # Gemma-style scaled embedding: output *= sqrt(hidden_size)
        embed_scale = config.hidden_size ** 0.5
        self.embed_tokens = _make_scaled_embedding(
            ops, config.vocab_size, config.hidden_size, embed_scale, device, dtype)
        blocks = []
        for layer_idx in range(config.num_hidden_layers):
            blocks.append(T5GemmaBlock(config, config.layer_types[layer_idx], device=device, dtype=dtype, ops=ops))
        self.layers = nn.ModuleList(blocks)
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype)

    def forward(self, input_ids, attention_mask=None, embeds=None, intermediate_output=None,
                final_layer_norm_intermediate=True, dtype=None, num_layers=None):
        """Encode tokens (or precomputed ``embeds``) and return ``(final, intermediate)``.

        ``intermediate`` is a copy of the hidden state after layer index
        ``intermediate_output`` (``None`` if no layer index matched), optionally
        passed through the final norm. ``num_layers`` is accepted but not used.
        """
        if embeds is not None:
            x = embeds
        else:
            x = self.embed_tokens(input_ids, out_dtype=dtype or torch.float32)

        seq_len = x.shape[1]
        position_ids = torch.arange(seq_len, device=x.device).unsqueeze(0)
        freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, device=x.device)

        additive_mask = None
        if attention_mask is not None:
            batch = attention_mask.shape[0]
            key_len = attention_mask.shape[-1]
            expanded = attention_mask.to(x.dtype).reshape((batch, 1, -1, key_len)).expand(batch, 1, seq_len, key_len)
            # Entries equal to 1 become 0; every other entry becomes -finfo.max.
            additive_mask = 1.0 - expanded
            additive_mask = additive_mask.masked_fill(additive_mask.to(torch.bool), -torch.finfo(x.dtype).max)

        captured = None
        for layer_idx, block in enumerate(self.layers):
            x = block(x, attention_mask=additive_mask, freqs_cis=freqs_cis)
            if layer_idx == intermediate_output:
                captured = x.clone()

        x = self.norm(x)
        if final_layer_norm_intermediate and captured is not None:
            captured = self.norm(captured)
        return x, captured
|
||||
|
||||
|
||||
class T5GemmaBody(nn.Module):
    """Wrapper whose only child is ``encoder``, so parameter names start with ``encoder.``.

    Keys: encoder.*
    """

    def __init__(self, config, device, dtype, ops):
        super().__init__()
        self.encoder = T5GemmaEncoder(config=config, device=device, dtype=dtype, ops=ops)
|
||||
|
||||
|
||||
class T5GemmaModel(nn.Module):
    """Top-level module handed to ``SDClipModel`` as ``model_class``.

    Layout is ``self.model.encoder.*``, matching checkpoint keys
    ``model.encoder.*``. ``config_dict`` is accepted for interface
    compatibility but not read; the fixed ``T5GemmaEncoderConfig`` is used.
    """

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        encoder_config = T5GemmaEncoderConfig()
        self.num_layers = encoder_config.num_hidden_layers
        self.dtype = dtype
        self.model = T5GemmaBody(config=encoder_config, device=device, dtype=dtype, ops=operations)

    def get_input_embeddings(self):
        """Return the token-embedding module of the encoder."""
        encoder = self.model.encoder
        return encoder.embed_tokens

    def set_input_embeddings(self, embeddings):
        """Replace the token-embedding module of the encoder."""
        encoder = self.model.encoder
        encoder.embed_tokens = embeddings

    def forward(self, input_ids, attention_mask=None, embeds=None, num_tokens=None,
                intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, **kwargs):
        """Delegate to the encoder, first resolving a negative ``intermediate_output`` index.

        Negative indices count from the end of the layer stack. ``num_tokens``
        and any extra keyword arguments are ignored.
        """
        layer_index = intermediate_output
        if layer_index is not None and layer_index < 0:
            layer_index = self.num_layers + layer_index
        encoder = self.model.encoder
        return encoder(
            input_ids,
            attention_mask=attention_mask,
            embeds=embeds,
            intermediate_output=layer_index,
            final_layer_norm_intermediate=final_layer_norm_intermediate,
            dtype=dtype,
            num_layers=self.num_layers,
        )
|
||||
|
||||
|
||||
class T5GemmaSDClipModel(sd1_clip.SDClipModel):
    """``SDClipModel`` preset that instantiates ``T5GemmaModel`` with attention masks enabled."""

    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        clip_settings = dict(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            # Empty config: T5GemmaModel builds its own fixed configuration.
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"pad": 0},
            model_class=T5GemmaModel,
            enable_attention_masks=True,
            zero_out_masked=True,
            model_options=model_options,
        )
        super().__init__(**clip_settings)
|
||||
|
||||
|
||||
class T5GemmaSDTokenizer(sd1_clip.SDTokenizer):
    """SentencePiece-backed tokenizer preset for the T5Gemma text encoder."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        # The serialized SentencePiece model is carried inside tokenizer_data.
        spiece_model = tokenizer_data.get("spiece_model", None)
        tokenizer_settings = dict(
            pad_with_end=False,
            embedding_size=768,
            embedding_key="t5gemma",
            tokenizer_class=SPieceTokenizer,
            has_start_token=False,
            has_end_token=False,
            pad_to_max_length=False,
            max_length=99999999,
            min_length=1,
            pad_token=0,
            tokenizer_data=tokenizer_data,
            tokenizer_args={"add_bos": False, "add_eos": False},
        )
        super().__init__(spiece_model, **tokenizer_settings)

    def state_dict(self):
        """Return the serialized SentencePiece model under the ``spiece_model`` key."""
        serialized = self.tokenizer.serialize_model()
        return {"spiece_model": serialized}
|
||||
|
||||
|
||||
class SAT5GemmaTokenizer(sd1_clip.SD1Tokenizer):
    """``SD1Tokenizer`` wrapper registering ``T5GemmaSDTokenizer`` under the name ``t5gemma``."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        wrapper_settings = dict(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            clip_name="t5gemma",
            tokenizer=T5GemmaSDTokenizer,
        )
        super().__init__(**wrapper_settings)
|
||||
|
||||
|
||||
class SAT5GemmaModel(sd1_clip.SD1ClipModel):
    """``SD1ClipModel`` wrapper registering ``T5GemmaSDClipModel`` under the name ``t5gemma``."""

    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        # Caller-supplied kwargs are forwarded untouched after the fixed ones.
        super().__init__(
            device=device,
            dtype=dtype,
            model_options=model_options,
            name="t5gemma",
            clip_model=T5GemmaSDClipModel,
            **kwargs,
        )
|
||||
@ -35,6 +35,19 @@ class AnthropicMessage(BaseModel):
|
||||
content: list[AnthropicTextContent | AnthropicImageContent] = Field(...)
|
||||
|
||||
|
||||
class AnthropicThinkingConfig(BaseModel):
    # `thinking` section of an Anthropic messages request.
    # `type` is required; `budget_tokens` is optional and validated as >= 1024.
    type: Literal["enabled", "disabled", "adaptive"]
    budget_tokens: int | None = Field(
        default=None,
        ge=1024,
        description="Reasoning budget in tokens. Used when type is 'enabled'. Must be less than max_tokens.",
    )
|
||||
|
||||
|
||||
class AnthropicOutputConfig(BaseModel):
    """Used with `thinking.type='adaptive'` on models like Opus 4.7."""

    # Optional effort hint; omitted (None) unless the caller sets it.
    effort: Literal["low", "medium", "high"] | None = None
|
||||
|
||||
|
||||
class AnthropicMessagesRequest(BaseModel):
|
||||
model: str = Field(...)
|
||||
messages: list[AnthropicMessage] = Field(...)
|
||||
@ -44,6 +57,8 @@ class AnthropicMessagesRequest(BaseModel):
|
||||
top_p: float | None = Field(None, ge=0.0, le=1.0)
|
||||
top_k: int | None = Field(None, ge=0)
|
||||
stop_sequences: list[str] | None = Field(None)
|
||||
thinking: AnthropicThinkingConfig | None = Field(None)
|
||||
output_config: AnthropicOutputConfig | None = Field(None)
|
||||
|
||||
|
||||
class AnthropicResponseTextBlock(BaseModel):
|
||||
@ -51,6 +66,14 @@ class AnthropicResponseTextBlock(BaseModel):
|
||||
text: str = Field(...)
|
||||
|
||||
|
||||
class AnthropicResponseThinkingBlock(BaseModel):
    # Response content block tagged with type "thinking"; `thinking` is required.
    type: Literal["thinking"] = "thinking"
    thinking: str
|
||||
|
||||
|
||||
AnthropicResponseBlock = AnthropicResponseTextBlock | AnthropicResponseThinkingBlock
|
||||
|
||||
|
||||
class AnthropicCacheCreationUsage(BaseModel):
|
||||
ephemeral_5m_input_tokens: int | None = Field(None)
|
||||
ephemeral_1h_input_tokens: int | None = Field(None)
|
||||
@ -69,7 +92,7 @@ class AnthropicMessagesResponse(BaseModel):
|
||||
type: str | None = Field(None)
|
||||
role: str | None = Field(None)
|
||||
model: str | None = Field(None)
|
||||
content: list[AnthropicResponseTextBlock] | None = Field(None)
|
||||
content: list[AnthropicResponseBlock] | None = Field(None)
|
||||
stop_reason: str | None = Field(None)
|
||||
stop_sequence: str | None = Field(None)
|
||||
usage: AnthropicMessagesUsage | None = Field(None)
|
||||
|
||||
93
comfy_api_nodes/apis/openrouter.py
Normal file
93
comfy_api_nodes/apis/openrouter.py
Normal file
@ -0,0 +1,93 @@
|
||||
"""Pydantic models for the OpenRouter chat completions API.
|
||||
|
||||
See: https://openrouter.ai/docs/api/api-reference/chat/send-chat-completion-request
|
||||
"""
|
||||
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class OpenRouterTextContent(BaseModel):
    # Text content block; `type` is the fixed discriminator, `text` is required.
    type: Literal["text"] = "text"
    text: str
|
||||
|
||||
|
||||
class OpenRouterImageUrl(BaseModel):
    # Wrapper object holding the (required) image URL string.
    url: str
|
||||
|
||||
|
||||
class OpenRouterImageContent(BaseModel):
    # Image content block; `type` is the fixed discriminator, `image_url` is required.
    type: Literal["image_url"] = "image_url"
    image_url: OpenRouterImageUrl
|
||||
|
||||
|
||||
class OpenRouterVideoUrl(BaseModel):
    # Wrapper object holding the (required) video URL string.
    url: str
|
||||
|
||||
|
||||
class OpenRouterVideoContent(BaseModel):
    # Video content block; `type` is the fixed discriminator, `video_url` is required.
    type: Literal["video_url"] = "video_url"
    video_url: OpenRouterVideoUrl
|
||||
|
||||
|
||||
OpenRouterContentBlock = OpenRouterTextContent | OpenRouterImageContent | OpenRouterVideoContent
|
||||
|
||||
|
||||
class OpenRouterMessage(BaseModel):
    # One chat message. `content` is either a plain string or a list of
    # text / image / video content blocks. Both fields are required.
    role: Literal["system", "user", "assistant"]
    content: str | list[OpenRouterContentBlock]
|
||||
|
||||
|
||||
class OpenRouterReasoningConfig(BaseModel):
    # Optional reasoning settings sent with a chat request.
    effort: str | None = None
    exclude: bool | None = Field(
        default=None,
        description="If true, model reasons but reasoning is excluded from response.",
    )
|
||||
|
||||
|
||||
class OpenRouterWebSearchOptions(BaseModel):
    # Optional web-search settings; the single field defaults to None.
    search_context_size: str | None = None
|
||||
|
||||
|
||||
class OpenRouterChatRequest(BaseModel):
    # Request body for the chat completions endpoint.
    # Required fields.
    model: str
    messages: list[OpenRouterMessage]
    # Optional fields; each defaults to None.
    seed: int | None = None
    reasoning: OpenRouterReasoningConfig | None = None
    web_search_options: OpenRouterWebSearchOptions | None = None
    # Streaming defaults to off.
    stream: bool = False
|
||||
|
||||
|
||||
class OpenRouterUsage(BaseModel):
    # Token accounting returned with a response; every field is optional.
    prompt_tokens: int | None = None
    completion_tokens: int | None = None
    total_tokens: int | None = None
    cost: float | None = Field(
        default=None,
        description="Server-side authoritative USD cost of the call.",
    )
|
||||
|
||||
|
||||
class OpenRouterResponseMessage(BaseModel):
    # Message object inside a response choice; every field is optional.
    role: str | None = None
    content: str | None = None
    reasoning: str | None = None
    refusal: str | None = None
|
||||
|
||||
|
||||
class OpenRouterChoice(BaseModel):
    # One entry of the response `choices` list; every field is optional.
    index: int | None = None
    message: OpenRouterResponseMessage | None = None
    finish_reason: str | None = None
|
||||
|
||||
|
||||
class OpenRouterError(BaseModel):
    # Error payload of a response; `code` may be an integer or a string.
    code: int | str | None = None
    message: str | None = None
    metadata: dict | None = None
|
||||
|
||||
|
||||
class OpenRouterChatResponse(BaseModel):
    # Response body of the chat completions endpoint. Every field is optional,
    # so both success payloads (`choices`, `usage`) and failure payloads
    # (`error`) validate against the same model.
    id: str | None = None
    model: str | None = None
    object: str | None = None
    provider: str | None = None
    choices: list[OpenRouterChoice] | None = None
    usage: OpenRouterUsage | None = None
    error: OpenRouterError | None = None
|
||||
@ -1,7 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Optional, List
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
@ -11,44 +9,76 @@ class Rodin3DGenerateRequest(BaseModel):
|
||||
material: str = Field(..., description="The material type.")
|
||||
quality_override: int = Field(..., description="The poly count of the mesh.")
|
||||
mesh_mode: str = Field(..., description="It controls the type of faces of generated models.")
|
||||
TAPose: Optional[bool] = Field(None, description="")
|
||||
TAPose: bool | None = Field(None, description="")
|
||||
|
||||
|
||||
class Rodin3DGen25Request(BaseModel):
    # Request payload for Rodin Gen-2.5 generation.
    # Only `tier` is required; every other field defaults to None.

    tier: str = Field(default=..., description="Gen-2.5 tier (e.g. Gen-2.5-High).")
    prompt: str | None = Field(default=None, description="Required for Text-to-3D; ignored otherwise.")
    seed: int | None = Field(default=None, description="0-65535.")

    # Output format / material selection.
    material: str | None = Field(default=None, description="PBR | Shaded | All | None.")
    geometry_file_format: str | None = Field(default=None, description="glb | usdz | fbx | obj | stl.")
    texture_mode: str | None = Field(default=None, description="legacy | extreme-low | low | medium | high.")
    mesh_mode: str | None = Field(default=None, description="Raw (triangular) | Quad.")
    quality_override: int | None = Field(default=None, description="Mesh face count override.")

    # Geometry guidance.
    geometry_instruct_mode: str | None = Field(default=None, description="faithful | creative.")
    bbox_condition: list[int] | None = Field(
        default=None, description="Bounding box [Width(Y), Height(Z), Length(X)] in cm."
    )
    height: int | None = Field(default=None, description="Approximate model height in cm.")

    # Boolean feature switches.
    TAPose: bool | None = Field(default=None, description="T/A pose for human-like models.")
    hd_texture: bool | None = Field(default=None, description="Enhanced texture quality.")
    texture_delight: bool | None = Field(default=None, description="Remove baked lighting from textures.")
    is_micro: bool | None = Field(default=None, description="Micro detail (Extreme-High only).")
    use_original_alpha: bool | None = Field(default=None, description="Preserve image transparency.")
    preview_render: bool | None = Field(default=None, description="Generate high-quality preview render.")

    addons: list[str] | None = Field(default=None, description='Optional addons, e.g. ["HighPack"].')
|
||||
|
||||
|
||||
class GenerateJobsData(BaseModel):
|
||||
uuids: List[str] = Field(..., description="str LIST")
|
||||
uuids: list[str] = Field(..., description="str LIST")
|
||||
subscription_key: str = Field(..., description="subscription key")
|
||||
|
||||
|
||||
class Rodin3DGenerateResponse(BaseModel):
|
||||
message: Optional[str] = Field(None, description="Return message.")
|
||||
prompt: Optional[str] = Field(None, description="Generated Prompt from image.")
|
||||
submit_time: Optional[str] = Field(None, description="Submit Time")
|
||||
uuid: Optional[str] = Field(None, description="Task str")
|
||||
jobs: Optional[GenerateJobsData] = Field(None, description="Details of jobs")
|
||||
message: str | None = Field(None, description="Return message.")
|
||||
prompt: str | None = Field(None, description="Generated Prompt from image.")
|
||||
submit_time: str | None = Field(None, description="Submit Time")
|
||||
uuid: str | None = Field(None, description="Task str")
|
||||
jobs: GenerateJobsData | None = Field(None, description="Details of jobs")
|
||||
|
||||
|
||||
class JobStatus(str, Enum):
    """
    Status for jobs
    """

    # Members are also `str` instances, so they compare equal to their values.
    # Terminal states.
    Done = "Done"
    Failed = "Failed"
    # In-progress states.
    Generating = "Generating"
    Waiting = "Waiting"
|
||||
|
||||
|
||||
class Rodin3DCheckStatusRequest(BaseModel):
    # Status-poll request; the subscription key is required.
    subscription_key: str = Field(default=..., description="subscription from generate endpoint")
|
||||
|
||||
|
||||
class JobItem(BaseModel):
|
||||
uuid: str = Field(..., description="uuid")
|
||||
status: JobStatus = Field(...,description="Status Currently")
|
||||
status: JobStatus = Field(..., description="Status Currently")
|
||||
|
||||
|
||||
class Rodin3DCheckStatusResponse(BaseModel):
|
||||
jobs: List[JobItem] = Field(..., description="Job status List")
|
||||
jobs: list[JobItem] = Field(..., description="Job status List")
|
||||
|
||||
|
||||
class Rodin3DDownloadRequest(BaseModel):
    # Download request; the task uuid is required.
    task_uuid: str = Field(default=..., description="Task str")
|
||||
|
||||
|
||||
class RodinResourceItem(BaseModel):
    # One downloadable resource; both fields are required.
    url: str = Field(default=..., description="Download Url")
    name: str = Field(default=..., description="File name with ext")
|
||||
|
||||
|
||||
class Rodin3DDownloadResponse(BaseModel):
|
||||
list: List[RodinResourceItem] = Field(..., description="Source List")
|
||||
items: list[RodinResourceItem] = Field(..., alias="list", description="Source List")
|
||||
|
||||
@ -9,8 +9,11 @@ from comfy_api_nodes.apis.anthropic import (
|
||||
AnthropicMessage,
|
||||
AnthropicMessagesRequest,
|
||||
AnthropicMessagesResponse,
|
||||
AnthropicOutputConfig,
|
||||
AnthropicResponseTextBlock,
|
||||
AnthropicRole,
|
||||
AnthropicTextContent,
|
||||
AnthropicThinkingConfig,
|
||||
)
|
||||
from comfy_api_nodes.util import (
|
||||
ApiEndpoint,
|
||||
@ -32,15 +35,29 @@ CLAUDE_MODELS: dict[str, str] = {
|
||||
"Haiku 4.5": "claude-haiku-4-5-20251001",
|
||||
}
|
||||
|
||||
_THINKING_UNSUPPORTED = {"Haiku 4.5"}
|
||||
# Models that use the newer "adaptive" thinking mode (Opus 4.7 requires it; older models keep the explicit budget API).
|
||||
# Anthropic decides the actual budget when adaptive is used, based on the `output_config.effort` hint.
|
||||
_ADAPTIVE_THINKING_MODELS = {"Opus 4.7", "Opus 4.6", "Sonnet 4.6"}
|
||||
|
||||
def _claude_model_inputs():
|
||||
return [
|
||||
# Budget mode (Sonnet 4.5): effort -> reasoning budget in tokens. Must be < max_tokens.
|
||||
# Sized so even the "high" budget fits comfortably under the default max_tokens=32768.
|
||||
_REASONING_BUDGET: dict[str, int] = {
|
||||
"low": 2048,
|
||||
"medium": 8192,
|
||||
"high": 16384,
|
||||
}
|
||||
_REASONING_EFFORTS = ["off", "low", "medium", "high"]
|
||||
|
||||
|
||||
def _claude_model_inputs(model_label: str):
|
||||
inputs: list = [
|
||||
IO.Int.Input(
|
||||
"max_tokens",
|
||||
default=16000,
|
||||
min=32,
|
||||
max=32000,
|
||||
tooltip="Maximum number of tokens to generate before stopping.",
|
||||
default=32768,
|
||||
min=4096,
|
||||
max=64000,
|
||||
tooltip="Maximum number of tokens to generate (includes reasoning tokens when enabled).",
|
||||
advanced=True,
|
||||
),
|
||||
IO.Float.Input(
|
||||
@ -49,10 +66,24 @@ def _claude_model_inputs():
|
||||
min=0.0,
|
||||
max=1.0,
|
||||
step=0.01,
|
||||
tooltip="Controls randomness. 0.0 is deterministic, 1.0 is most random. Ignored for Opus 4.7.",
|
||||
tooltip=(
|
||||
"Controls randomness. 0.0 is deterministic, 1.0 is most random. "
|
||||
"Ignored for Opus 4.7 and any model when reasoning_effort is set."
|
||||
),
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
if model_label not in _THINKING_UNSUPPORTED:
|
||||
inputs.append(
|
||||
IO.Combo.Input(
|
||||
"reasoning_effort",
|
||||
options=_REASONING_EFFORTS,
|
||||
default="off",
|
||||
tooltip="Extended thinking effort. 'off' disables reasoning.",
|
||||
advanced=True,
|
||||
)
|
||||
)
|
||||
return inputs
|
||||
|
||||
|
||||
def _model_price_per_million(model: str) -> tuple[float, float] | None:
|
||||
@ -95,7 +126,11 @@ def calculate_tokens_price(response: AnthropicMessagesResponse) -> float | None:
|
||||
def _get_text_from_response(response: AnthropicMessagesResponse) -> str:
|
||||
if not response.content:
|
||||
return ""
|
||||
return "\n".join(block.text for block in response.content if block.text)
|
||||
# Thinking blocks are silently dropped — we never want reasoning in the output.
|
||||
return "\n".join(
|
||||
block.text for block in response.content
|
||||
if isinstance(block, AnthropicResponseTextBlock) and block.text
|
||||
)
|
||||
|
||||
|
||||
async def _build_image_content_blocks(
|
||||
@ -133,7 +168,10 @@ class ClaudeNode(IO.ComfyNode):
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[IO.DynamicCombo.Option(label, _claude_model_inputs()) for label in CLAUDE_MODELS],
|
||||
options=[
|
||||
IO.DynamicCombo.Option(label, _claude_model_inputs(label))
|
||||
for label in CLAUDE_MODELS
|
||||
],
|
||||
tooltip="The Claude model used to generate the response.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
@ -207,8 +245,29 @@ class ClaudeNode(IO.ComfyNode):
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||
model_label = model["model"]
|
||||
max_tokens = model["max_tokens"]
|
||||
temperature = None if model_label == "Opus 4.7" else model["temperature"]
|
||||
max_tokens = model.get("max_tokens", 32768)
|
||||
reasoning_effort = model.get("reasoning_effort", "off")
|
||||
thinking_enabled = reasoning_effort not in ("off", None) and model_label not in _THINKING_UNSUPPORTED
|
||||
|
||||
# Anthropic requires temperature to be unset (defaults to 1.0) when thinking is enabled.
|
||||
# Opus 4.7 also rejects user-supplied temperature.
|
||||
if thinking_enabled or model_label == "Opus 4.7":
|
||||
temperature = None
|
||||
else:
|
||||
temperature = model.get("temperature", 1.0)
|
||||
|
||||
thinking_cfg: AnthropicThinkingConfig | None = None
|
||||
output_cfg: AnthropicOutputConfig | None = None
|
||||
if thinking_enabled:
|
||||
if model_label in _ADAPTIVE_THINKING_MODELS:
|
||||
# Adaptive mode - Anthropic chooses the budget based on effort hint
|
||||
thinking_cfg = AnthropicThinkingConfig(type="adaptive")
|
||||
output_cfg = AnthropicOutputConfig(effort=reasoning_effort)
|
||||
else:
|
||||
# Budget mode (Sonnet 4.5). Leave at least 1024 tokens for the actual response
|
||||
budget = _REASONING_BUDGET[reasoning_effort]
|
||||
budget = min(budget, max(1024, max_tokens - 1024))
|
||||
thinking_cfg = AnthropicThinkingConfig(type="enabled", budget_tokens=budget)
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in (images or {}).values() if t is not None]
|
||||
if sum(get_number_of_images(t) for t in image_tensors) > CLAUDE_MAX_IMAGES:
|
||||
@ -229,6 +288,8 @@ class ClaudeNode(IO.ComfyNode):
|
||||
messages=[AnthropicMessage(role=AnthropicRole.user, content=content)],
|
||||
system=system_prompt or None,
|
||||
temperature=temperature,
|
||||
thinking=thinking_cfg,
|
||||
output_config=output_cfg,
|
||||
),
|
||||
price_extractor=calculate_tokens_price,
|
||||
)
|
||||
|
||||
@ -43,15 +43,16 @@ from comfy_api_nodes.util import (
|
||||
ApiEndpoint,
|
||||
download_url_to_image_tensor,
|
||||
download_url_to_video_output,
|
||||
downscale_video_to_max_pixels,
|
||||
get_number_of_images,
|
||||
image_tensor_pair_to_batch,
|
||||
poll_op,
|
||||
resize_video_to_pixel_budget,
|
||||
sync_op,
|
||||
upload_audio_to_comfyapi,
|
||||
upload_image_to_comfyapi,
|
||||
upload_images_to_comfyapi,
|
||||
upload_video_to_comfyapi,
|
||||
upscale_video_to_min_pixels,
|
||||
validate_image_aspect_ratio,
|
||||
validate_image_dimensions,
|
||||
validate_string,
|
||||
@ -110,12 +111,13 @@ def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: st
|
||||
max_px = limits.get("max")
|
||||
if min_px and pixels < min_px:
|
||||
raise ValueError(
|
||||
f"Reference video {index} is too small: {w}x{h} = {pixels:,}px. " f"Minimum is {min_px:,}px for this model."
|
||||
f"Reference video {index} is too small: {w}x{h} = {pixels:,} total pixels. "
|
||||
f"Minimum for this model is {min_px:,} total pixels."
|
||||
)
|
||||
if max_px and pixels > max_px:
|
||||
raise ValueError(
|
||||
f"Reference video {index} is too large: {w}x{h} = {pixels:,}px. "
|
||||
f"Maximum is {max_px:,}px for this model. Try downscaling the video."
|
||||
f"Reference video {index} is too large: {w}x{h} = {pixels:,} total pixels. "
|
||||
f"Maximum for this model is {max_px:,} total pixels. Try downscaling the video."
|
||||
)
|
||||
|
||||
|
||||
@ -1676,14 +1678,14 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
"first_frame_asset_id",
|
||||
default="",
|
||||
tooltip="Seedance asset_id to use as the first frame. "
|
||||
"Mutually exclusive with the first_frame image input.",
|
||||
"Mutually exclusive with the first_frame image input.",
|
||||
optional=True,
|
||||
),
|
||||
IO.String.Input(
|
||||
"last_frame_asset_id",
|
||||
default="",
|
||||
tooltip="Seedance asset_id to use as the last frame. "
|
||||
"Mutually exclusive with the last_frame image input.",
|
||||
"Mutually exclusive with the last_frame image input.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
@ -1865,11 +1867,20 @@ def _seedance2_reference_inputs(resolutions: list[str], default_ratio: str = "16
|
||||
IO.Boolean.Input(
|
||||
"auto_downscale",
|
||||
default=False,
|
||||
advanced=True,
|
||||
optional=True,
|
||||
tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
|
||||
"for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"auto_upscale",
|
||||
default=False,
|
||||
advanced=True,
|
||||
optional=True,
|
||||
tooltip="Automatically upscale reference videos that are below the model's minimum pixel count "
|
||||
"for the selected resolution. Aspect ratio is preserved; videos already meeting the minimum are "
|
||||
"untouched. Note: upscaling a low-resolution source does not add real detail and may produce "
|
||||
"lower-quality generations.",
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"reference_assets",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
@ -2030,7 +2041,13 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
max_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("max")
|
||||
if max_px:
|
||||
for key in reference_videos:
|
||||
reference_videos[key] = resize_video_to_pixel_budget(reference_videos[key], max_px)
|
||||
reference_videos[key] = downscale_video_to_max_pixels(reference_videos[key], max_px)
|
||||
|
||||
if model.get("auto_upscale") and reference_videos:
|
||||
min_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("min")
|
||||
if min_px:
|
||||
for key in reference_videos:
|
||||
reference_videos[key] = upscale_video_to_min_pixels(reference_videos[key], min_px)
|
||||
|
||||
total_video_duration = 0.0
|
||||
for i, key in enumerate(reference_videos, 1):
|
||||
|
||||
374
comfy_api_nodes/nodes_openrouter.py
Normal file
374
comfy_api_nodes/nodes_openrouter.py
Normal file
@ -0,0 +1,374 @@
|
||||
"""API Nodes for OpenRouter LLM chat completions."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
from comfy_api.latest import IO, ComfyExtension, Input
|
||||
from comfy_api_nodes.apis.openrouter import (
|
||||
OpenRouterChatRequest,
|
||||
OpenRouterChatResponse,
|
||||
OpenRouterContentBlock,
|
||||
OpenRouterImageContent,
|
||||
OpenRouterImageUrl,
|
||||
OpenRouterMessage,
|
||||
OpenRouterReasoningConfig,
|
||||
OpenRouterTextContent,
|
||||
OpenRouterVideoContent,
|
||||
OpenRouterVideoUrl,
|
||||
OpenRouterWebSearchOptions,
|
||||
)
|
||||
from comfy_api_nodes.util import (
|
||||
ApiEndpoint,
|
||||
get_number_of_images,
|
||||
sync_op,
|
||||
upload_images_to_comfyapi,
|
||||
upload_video_to_comfyapi,
|
||||
validate_string,
|
||||
)
|
||||
|
||||
# Path that chat-completion requests from this module are POSTed to.
OPENROUTER_CHAT_ENDPOINT = "/proxy/openrouter/api/v1/chat/completions"


# Capability profile of a model; _profile_inputs() maps each value to the extra
# widgets (reasoning effort and/or web-search context size) the model exposes.
Profile = Literal["standard", "reasoning", "frontier_reasoning", "perplexity", "perplexity_reasoning"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class _ModelSpec:
    """Immutable description of one OpenRouter model offered by the node."""

    slug: str  # exact OpenRouter model id
    profile: Profile  # selects the extra inputs shown for this model
    price_in: float  # USD per token (prompt)
    price_out: float  # USD per token (completion)
    max_images: int = 0  # 0 = no image input; otherwise max URL-passed images supported
    max_videos: int = 0  # 0 = no video input; otherwise max URL-passed videos supported
|
||||
|
||||
|
||||
# Model catalogue. Positional fields are (slug, profile, price_in, price_out);
# prices are USD per token, as declared on _ModelSpec.
MODELS: list[_ModelSpec] = [
    _ModelSpec("anthropic/claude-opus-4.7", "frontier_reasoning", 0.000005, 0.000025, max_images=20),
    _ModelSpec("openai/gpt-5.5-pro", "frontier_reasoning", 0.00003, 0.00018, max_images=20),
    _ModelSpec("openai/gpt-5.5", "frontier_reasoning", 0.000005, 0.00003, max_images=20),
    _ModelSpec("google/gemini-3.5-flash", "reasoning", 0.0000015, 0.000009, max_images=20, max_videos=4),
    _ModelSpec("x-ai/grok-4.20", "reasoning", 0.00000125, 0.0000025, max_images=20),
    _ModelSpec("x-ai/grok-4.3", "reasoning", 0.00000125, 0.0000025, max_images=20),
    _ModelSpec("deepseek/deepseek-v4-pro", "reasoning", 0.000000435, 0.00000087),
    _ModelSpec("deepseek/deepseek-v4-flash", "reasoning", 0.000000112, 0.000000224),
    _ModelSpec("deepseek/deepseek-v3.2", "reasoning", 0.000000252, 0.000000378),
    _ModelSpec("qwen/qwen3.6-max-preview", "reasoning", 0.00000104, 0.00000624),
    _ModelSpec("qwen/qwen3.6-plus", "reasoning", 0.000000325, 0.00000195, max_images=10, max_videos=4),
    _ModelSpec("qwen/qwen3.6-flash", "reasoning", 0.0000001875, 0.000001125, max_images=10, max_videos=4),
    _ModelSpec("mistralai/mistral-large-2512", "standard", 0.0000005, 0.0000015, max_images=8),
    _ModelSpec("mistralai/mistral-medium-3-5", "reasoning", 0.0000015, 0.0000075, max_images=8),
    _ModelSpec("z-ai/glm-4.6", "reasoning", 0.00000043, 0.00000174),
    _ModelSpec("z-ai/glm-5", "reasoning", 0.0000006, 0.00000192),
    _ModelSpec("moonshotai/kimi-k2.6", "reasoning", 0.00000073, 0.00000349, max_images=10),
    _ModelSpec("moonshotai/kimi-k2-thinking", "reasoning", 0.0000006, 0.0000025),
    _ModelSpec("perplexity/sonar-pro", "perplexity", 0.000003, 0.000015),
    _ModelSpec("perplexity/sonar-reasoning-pro", "perplexity_reasoning", 0.000002, 0.000008),
    _ModelSpec("perplexity/sonar-deep-research", "perplexity_reasoning", 0.000002, 0.000008),
]

# Slug -> spec lookup built from MODELS.
_MODELS_BY_SLUG: dict[str, _ModelSpec] = {m.slug: m for m in MODELS}
# Choices offered by the "reasoning_effort" combo.
_REASONING_EFFORTS = ["off", "low", "medium", "high"]
# Choices offered by the "search_context_size" combo.
_SEARCH_CONTEXT_SIZES = ["low", "medium", "high"]
|
||||
|
||||
|
||||
def _reasoning_extra_inputs() -> list:
    """Return the extra widget list for models that accept a reasoning-effort setting."""
    effort_combo = IO.Combo.Input(
        "reasoning_effort",
        options=_REASONING_EFFORTS,
        default="off",
        tooltip="Reasoning effort. 'off' disables reasoning entirely.",
        advanced=True,
    )
    return [effort_combo]
|
||||
|
||||
|
||||
def _perplexity_extra_inputs() -> list:
    """Return the extra widget list for models that accept a web-search context size."""
    context_combo = IO.Combo.Input(
        "search_context_size",
        options=_SEARCH_CONTEXT_SIZES,
        default="medium",
        tooltip="How much web search context to retrieve. Larger = more grounded but slower/pricier.",
        advanced=True,
    )
    return [context_combo]
|
||||
|
||||
|
||||
def _profile_inputs(profile: Profile) -> list:
|
||||
if profile == "standard":
|
||||
return []
|
||||
if profile in ("reasoning", "frontier_reasoning"):
|
||||
return _reasoning_extra_inputs()
|
||||
if profile == "perplexity":
|
||||
return _perplexity_extra_inputs()
|
||||
if profile == "perplexity_reasoning":
|
||||
return _perplexity_extra_inputs() + _reasoning_extra_inputs()
|
||||
raise ValueError(f"Unknown profile: {profile}")
|
||||
|
||||
|
||||
def _media_inputs(spec: _ModelSpec) -> list:
|
||||
extras: list = []
|
||||
if spec.max_images > 0:
|
||||
extras.append(
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, spec.max_images + 1)],
|
||||
min=0,
|
||||
),
|
||||
tooltip=f"Optional reference image(s) — up to {spec.max_images}. Sent as URLs.",
|
||||
)
|
||||
)
|
||||
if spec.max_videos > 0:
|
||||
extras.append(
|
||||
IO.Autogrow.Input(
|
||||
"videos",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Video.Input("video"),
|
||||
names=[f"video_{i}" for i in range(1, spec.max_videos + 1)],
|
||||
min=0,
|
||||
),
|
||||
tooltip=f"Optional reference video(s) — up to {spec.max_videos}. Sent as URLs.",
|
||||
)
|
||||
)
|
||||
return extras
|
||||
|
||||
|
||||
def _inputs_for_model(spec: _ModelSpec) -> list:
    """Return all per-model extra inputs: profile widgets first, then media inputs."""
    combined = list(_profile_inputs(spec.profile))
    combined.extend(_media_inputs(spec))
    return combined
|
||||
|
||||
|
||||
def _build_model_options() -> list[IO.DynamicCombo.Option]:
    """Build one dynamic-combo option per entry in MODELS, keyed by the model slug."""
    options = []
    for spec in MODELS:
        options.append(IO.DynamicCombo.Option(spec.slug, _inputs_for_model(spec)))
    return options
|
||||
|
||||
|
||||
def _calculate_price(response: OpenRouterChatResponse) -> float | None:
|
||||
if response.usage and response.usage.cost is not None:
|
||||
return float(response.usage.cost)
|
||||
return None
|
||||
|
||||
|
||||
def _price_badge_jsonata() -> str:
    """Build the expression string used for the node's price badge.

    The expression embeds a table mapping each model slug in MODELS to a
    ``[prompt, completion]`` pair of prices per 1K tokens, looks up the
    currently selected ``widgets.model`` in it, and yields a "list_usd" badge
    for known models or the text "Token-based" otherwise.
    """
    rates_pairs = []
    for spec in MODELS:
        # Spec prices are per token; the badge shows prices per 1K tokens.
        prompt_per_1k = spec.price_in * 1000
        completion_per_1k = spec.price_out * 1000
        rates_pairs.append(f' "{spec.slug}": [{prompt_per_1k:.8g}, {completion_per_1k:.8g}]')
    rates_block = ",\n".join(rates_pairs)
    return (
        "(\n"
        " $rates := {\n"
        f"{rates_block}\n"
        " };\n"
        " $r := $lookup($rates, widgets.model);\n"
        " $r ? {\n"
        ' "type": "list_usd",\n'
        ' "usd": $r,\n'
        ' "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }\n'
        ' } : {"type": "text", "text": "Token-based"}\n'
        ")"
    )
|
||||
|
||||
|
||||
async def _build_image_blocks(
    cls: type[IO.ComfyNode], spec: _ModelSpec, images: list[Input.Image]
) -> list[OpenRouterImageContent]:
    """Upload the reference images and wrap each returned URL in an image content block."""
    uploaded_urls = await upload_images_to_comfyapi(
        cls,
        images,
        max_images=spec.max_images,
        total_pixels=2048 * 2048,
        mime_type="image/png",
        wait_label="Uploading reference images",
    )
    image_blocks: list[OpenRouterImageContent] = []
    for uploaded_url in uploaded_urls:
        image_blocks.append(OpenRouterImageContent(image_url=OpenRouterImageUrl(url=uploaded_url)))
    return image_blocks
|
||||
|
||||
|
||||
async def _build_video_blocks(cls: type[IO.ComfyNode], videos: list[Input.Video]) -> list[OpenRouterVideoContent]:
    """Upload each reference video in order and wrap its URL in a video content block."""
    base_label = "Uploading reference video"
    count = len(videos)
    video_blocks: list[OpenRouterVideoContent] = []
    for position, clip in enumerate(videos, start=1):
        # Only show an "(i/n)" progress suffix when there is more than one video.
        wait_label = f"{base_label} ({position}/{count})" if count > 1 else base_label
        uploaded_url = await upload_video_to_comfyapi(cls, clip, wait_label=wait_label)
        video_blocks.append(OpenRouterVideoContent(video_url=OpenRouterVideoUrl(url=uploaded_url)))
    return video_blocks
|
||||
|
||||
|
||||
def _user_message(prompt: str, media_blocks: list[OpenRouterContentBlock]) -> OpenRouterMessage:
    """Build the user message: plain text without media, else media blocks followed by the text."""
    if media_blocks:
        # Media blocks come first; the prompt text is the final block.
        content = [*media_blocks, OpenRouterTextContent(text=prompt)]
        return OpenRouterMessage(role="user", content=content)
    return OpenRouterMessage(role="user", content=prompt)
|
||||
|
||||
|
||||
def _build_messages(
    system_prompt: str, prompt: str, media_blocks: list[OpenRouterContentBlock]
) -> list[OpenRouterMessage]:
    """Return the message list: an optional system message followed by the user message."""
    system_part = [OpenRouterMessage(role="system", content=system_prompt)] if system_prompt else []
    return [*system_part, _user_message(prompt, media_blocks)]
|
||||
|
||||
|
||||
def _build_request(
    slug: str,
    system_prompt: str,
    prompt: str,
    media_blocks: list[OpenRouterContentBlock],
    *,
    seed: int,
    reasoning_effort: str | None,
    search_context_size: str | None,
) -> OpenRouterChatRequest:
    """Assemble the chat request for ``slug``.

    Reasoning config is attached only when ``reasoning_effort`` is set and not
    "off"; web-search options only when ``search_context_size`` is set; the
    seed is sent only when it is greater than 0.
    """
    reasoning_requested = bool(reasoning_effort) and reasoning_effort != "off"
    # exclude=True asks providers to reason internally but not return the trace
    reasoning = (
        OpenRouterReasoningConfig(effort=reasoning_effort, exclude=True) if reasoning_requested else None
    )
    web_search = (
        OpenRouterWebSearchOptions(search_context_size=search_context_size) if search_context_size else None
    )
    return OpenRouterChatRequest(
        model=slug,
        messages=_build_messages(system_prompt, prompt, media_blocks),
        seed=seed if seed > 0 else None,
        reasoning=reasoning,
        web_search_options=web_search,
    )
|
||||
|
||||
|
||||
def _extract_text(response: OpenRouterChatResponse) -> str:
|
||||
if response.error:
|
||||
code = response.error.code if response.error.code is not None else "unknown"
|
||||
raise ValueError(f"OpenRouter error ({code}): {response.error.message or 'no message'}")
|
||||
if not response.choices:
|
||||
raise ValueError("Empty response from OpenRouter (no choices).")
|
||||
message = response.choices[0].message
|
||||
if not message:
|
||||
raise ValueError("Empty response from OpenRouter (no message).")
|
||||
if message.refusal:
|
||||
raise ValueError(f"Model refused to respond: {message.refusal}")
|
||||
return message.content or ""
|
||||
|
||||
|
||||
class OpenRouterLLMNode(IO.ComfyNode):
    """API node that sends a text (plus optional image/video) prompt to an OpenRouter model."""

    @classmethod
    def define_schema(cls):
        """Declare the node's inputs, output, hidden auth fields and price badge."""
        return IO.Schema(
            node_id="OpenRouterLLMNode",
            display_name="OpenRouter LLM",
            category="api node/text/OpenRouter",
            essentials_category="Text Generation",
            # Provider list kept in sync with the slugs declared in MODELS.
            description=(
                "Generate text responses through OpenRouter. Routes to a curated set of popular "
                "models from Anthropic, OpenAI, Google, xAI, DeepSeek, Qwen, Mistral, Z.AI (GLM), "
                "Moonshot (Kimi), and Perplexity Sonar."
            ),
            inputs=[
                IO.String.Input(
                    "prompt",
                    multiline=True,
                    default="",
                    tooltip="Text input to the model.",
                ),
                IO.DynamicCombo.Input(
                    "model",
                    options=_build_model_options(),
                    tooltip="The OpenRouter model used to generate the response.",
                ),
                IO.Int.Input(
                    "seed",
                    default=0,
                    min=0,
                    max=2147483647,
                    control_after_generate=True,
                    tooltip="Seed for sampling. Set to 0 to omit. Most models treat this as a hint only.",
                ),
                IO.String.Input(
                    "system_prompt",
                    multiline=True,
                    default="",
                    optional=True,
                    advanced=True,
                    tooltip="Foundational instructions that dictate the model's behavior.",
                ),
            ],
            outputs=[IO.String.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(
                depends_on=IO.PriceBadgeDepends(widgets=["model"]),
                expr=_price_badge_jsonata(),
            ),
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        model: dict,
        seed: int,
        system_prompt: str = "",
    ) -> IO.NodeOutput:
        """Run one chat completion and return the response text.

        Args:
            prompt: User text; validated to be non-empty after stripping whitespace.
            model: Dynamic-combo payload. ``model["model"]`` is the slug; the
                optional keys "reasoning_effort", "search_context_size",
                "images" and "videos" are read when present.
            seed: Sampling seed; values <= 0 are omitted from the request.
            system_prompt: Optional system message.

        Raises:
            ValueError: for an unknown model slug, too many images/videos for
                the selected model, or an error/empty/refused response.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1)
        slug: str = model["model"]
        spec = _MODELS_BY_SLUG.get(slug)
        if spec is None:
            raise ValueError(f"Unknown OpenRouter model: {slug}")

        reasoning_effort: str | None = model.get("reasoning_effort")
        search_context_size: str | None = model.get("search_context_size")

        # Autogrow inputs arrive as name -> value mappings; drop unconnected (None) slots.
        image_tensors: list[Input.Image] = [t for t in (model.get("images") or {}).values() if t is not None]
        if image_tensors and sum(get_number_of_images(t) for t in image_tensors) > spec.max_images:
            raise ValueError(f"Up to {spec.max_images} images are supported for {slug}.")
        video_inputs: list[Input.Video] = [v for v in (model.get("videos") or {}).values() if v is not None]
        if video_inputs and len(video_inputs) > spec.max_videos:
            raise ValueError(f"Up to {spec.max_videos} videos are supported for {slug}.")

        # Images are placed before videos in the message content.
        media_blocks: list[OpenRouterContentBlock] = []
        if image_tensors:
            media_blocks.extend(await _build_image_blocks(cls, spec, image_tensors))
        if video_inputs:
            media_blocks.extend(await _build_video_blocks(cls, video_inputs))

        request = _build_request(
            slug,
            system_prompt,
            prompt,
            media_blocks,
            seed=seed,
            reasoning_effort=reasoning_effort,
            search_context_size=search_context_size,
        )

        response = await sync_op(
            cls,
            ApiEndpoint(path=OPENROUTER_CHAT_ENDPOINT, method="POST"),
            response_model=OpenRouterChatResponse,
            data=request,
            price_extractor=_calculate_price,
        )
        return IO.NodeOutput(_extract_text(response))
|
||||
|
||||
|
||||
class OpenRouterExtension(ComfyExtension):
    """Extension exposing the OpenRouter node classes defined in this module."""

    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes: list[type[IO.ComfyNode]] = [OpenRouterLLMNode]
        return node_classes
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> OpenRouterExtension:
    """Create and return this module's extension instance."""
    extension = OpenRouterExtension()
    return extension
|
||||
@ -5,32 +5,37 @@ Rodin API docs: https://developer.hyper3d.ai/
|
||||
|
||||
"""
|
||||
|
||||
from inspect import cleandoc
|
||||
import folder_paths as comfy_paths
|
||||
import os
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from inspect import cleandoc
|
||||
from io import BytesIO
|
||||
from typing_extensions import override
|
||||
from typing import Any
|
||||
|
||||
import aiohttp
|
||||
from PIL import Image
|
||||
from typing_extensions import override
|
||||
|
||||
import folder_paths as comfy_paths
|
||||
from comfy_api.latest import IO, ComfyExtension, Types
|
||||
from comfy_api_nodes.apis.rodin import (
|
||||
Rodin3DGenerateRequest,
|
||||
Rodin3DGenerateResponse,
|
||||
JobStatus,
|
||||
Rodin3DCheckStatusRequest,
|
||||
Rodin3DCheckStatusResponse,
|
||||
Rodin3DDownloadRequest,
|
||||
Rodin3DDownloadResponse,
|
||||
JobStatus,
|
||||
Rodin3DGen25Request,
|
||||
Rodin3DGenerateRequest,
|
||||
Rodin3DGenerateResponse,
|
||||
)
|
||||
from comfy_api_nodes.util import (
|
||||
sync_op,
|
||||
poll_op,
|
||||
ApiEndpoint,
|
||||
download_url_to_bytesio,
|
||||
download_url_to_file_3d,
|
||||
poll_op,
|
||||
sync_op,
|
||||
validate_string,
|
||||
)
|
||||
from comfy_api.latest import ComfyExtension, IO, Types
|
||||
|
||||
|
||||
COMMON_PARAMETERS = [
|
||||
IO.Int.Input(
|
||||
@ -51,40 +56,30 @@ COMMON_PARAMETERS = [
|
||||
]
|
||||
|
||||
|
||||
def get_quality_mode(poly_count):
|
||||
polycount = poly_count.split("-")
|
||||
poly = polycount[1]
|
||||
count = polycount[0]
|
||||
if poly == "Triangle":
|
||||
mesh_mode = "Raw"
|
||||
elif poly == "Quad":
|
||||
mesh_mode = "Quad"
|
||||
else:
|
||||
mesh_mode = "Quad"
|
||||
|
||||
if count == "4K":
|
||||
quality_override = 4000
|
||||
elif count == "8K":
|
||||
quality_override = 8000
|
||||
elif count == "18K":
|
||||
quality_override = 18000
|
||||
elif count == "50K":
|
||||
quality_override = 50000
|
||||
elif count == "2K":
|
||||
quality_override = 2000
|
||||
elif count == "20K":
|
||||
quality_override = 20000
|
||||
elif count == "150K":
|
||||
quality_override = 150000
|
||||
elif count == "500K":
|
||||
quality_override = 500000
|
||||
else:
|
||||
quality_override = 18000
|
||||
|
||||
return mesh_mode, quality_override
|
||||
_QUALITY_MESH_OPTIONS: dict[str, tuple[str, int]] = {
|
||||
"4K-Quad": ("Quad", 4000),
|
||||
"8K-Quad": ("Quad", 8000),
|
||||
"18K-Quad": ("Quad", 18000),
|
||||
"50K-Quad": ("Quad", 50000),
|
||||
"200K-Quad": ("Quad", 200000),
|
||||
"2K-Triangle": ("Raw", 2000),
|
||||
"20K-Triangle": ("Raw", 20000),
|
||||
"150K-Triangle": ("Raw", 150000),
|
||||
"200K-Triangle": ("Raw", 200000),
|
||||
"500K-Triangle": ("Raw", 500000),
|
||||
"1M-Triangle": ("Raw", 1000000),
|
||||
}
|
||||
|
||||
|
||||
def tensor_to_filelike(tensor, max_pixels: int = 2048*2048):
|
||||
def get_quality_mode(poly_count: str) -> tuple[str, int]:
    """Translate a polygon-count preset such as '18K-Quad' into (mesh_mode, quality_override).

    Labels missing from the preset table yield ('Quad', 18000), matching the
    legacy behaviour.
    """
    if poly_count in _QUALITY_MESH_OPTIONS:
        return _QUALITY_MESH_OPTIONS[poly_count]
    return ("Quad", 18000)
|
||||
|
||||
|
||||
def tensor_to_filelike(tensor, max_pixels: int = 2048 * 2048):
|
||||
"""
|
||||
Converts a PyTorch tensor to a file-like object.
|
||||
|
||||
@ -96,8 +91,8 @@ def tensor_to_filelike(tensor, max_pixels: int = 2048*2048):
|
||||
- io.BytesIO: A file-like object containing the image data.
|
||||
"""
|
||||
array = tensor.cpu().numpy()
|
||||
array = (array * 255).astype('uint8')
|
||||
image = Image.fromarray(array, 'RGB')
|
||||
array = (array * 255).astype("uint8")
|
||||
image = Image.fromarray(array, "RGB")
|
||||
|
||||
original_width, original_height = image.size
|
||||
original_pixels = original_width * original_height
|
||||
@ -112,7 +107,7 @@ def tensor_to_filelike(tensor, max_pixels: int = 2048*2048):
|
||||
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
||||
|
||||
img_byte_arr = BytesIO()
|
||||
image.save(img_byte_arr, format='PNG') # PNG is used for lossless compression
|
||||
image.save(img_byte_arr, format="PNG") # PNG is used for lossless compression
|
||||
img_byte_arr.seek(0)
|
||||
return img_byte_arr
|
||||
|
||||
@ -145,11 +140,9 @@ async def create_generate_task(
|
||||
TAPose=ta_pose,
|
||||
),
|
||||
files=[
|
||||
(
|
||||
"images",
|
||||
open(image, "rb") if isinstance(image, str) else tensor_to_filelike(image)
|
||||
)
|
||||
for image in images if image is not None
|
||||
("images", open(image, "rb") if isinstance(image, str) else tensor_to_filelike(image))
|
||||
for image in images
|
||||
if image is not None
|
||||
],
|
||||
content_type="multipart/form-data",
|
||||
)
|
||||
@ -177,6 +170,7 @@ def check_rodin_status(response: Rodin3DCheckStatusResponse) -> str:
|
||||
return "DONE"
|
||||
return "Generating"
|
||||
|
||||
|
||||
def extract_progress(response: Rodin3DCheckStatusResponse) -> int | None:
|
||||
if not response.jobs:
|
||||
return None
|
||||
@ -214,7 +208,7 @@ async def download_files(url_list, task_uuid: str) -> tuple[str | None, Types.Fi
|
||||
model_file_path = None
|
||||
file_3d = None
|
||||
|
||||
for i in url_list.list:
|
||||
for i in url_list.items:
|
||||
file_path = os.path.join(save_path, i.name)
|
||||
if i.name.lower().endswith(".glb"):
|
||||
model_file_path = os.path.join(result_folder_name, i.name)
|
||||
@ -489,7 +483,16 @@ class Rodin3D_Gen2(IO.ComfyNode):
|
||||
IO.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True),
|
||||
IO.Combo.Input(
|
||||
"Polygon_count",
|
||||
options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"],
|
||||
options=[
|
||||
"4K-Quad",
|
||||
"8K-Quad",
|
||||
"18K-Quad",
|
||||
"50K-Quad",
|
||||
"2K-Triangle",
|
||||
"20K-Triangle",
|
||||
"150K-Triangle",
|
||||
"500K-Triangle",
|
||||
],
|
||||
default="500K-Triangle",
|
||||
optional=True,
|
||||
),
|
||||
@ -542,6 +545,566 @@ class Rodin3D_Gen2(IO.ComfyNode):
|
||||
return IO.NodeOutput(model_path, file_3d)
|
||||
|
||||
|
||||
def _rodin_multipart_parser(data: dict[str, Any]) -> aiohttp.FormData:
    """Turn a Rodin request dict into a multipart form with explicit value serialization.

    ``None`` values are skipped, booleans become "true"/"false", each list
    element becomes its own field under the same key, bytes are passed through
    unchanged and everything else is stringified.
    """
    form = aiohttp.FormData(default_to_multipart=True)
    for field_name, field_value in data.items():
        if field_value is None:
            continue
        # bool is checked before the generic str() fallback so it is not sent as "True"/"False".
        if isinstance(field_value, bool):
            parts = ["true" if field_value else "false"]
        elif isinstance(field_value, list):
            parts = [str(element) for element in field_value]
        elif isinstance(field_value, (bytes, bytearray)):
            parts = [field_value]
        else:
            parts = [str(field_value)]
        for part in parts:
            form.add_field(field_name, part)
    return form
|
||||
|
||||
|
||||
async def _create_gen25_task(
    cls: type[IO.ComfyNode],
    request: Rodin3DGen25Request,
    images: list | None,
) -> tuple[str, str]:
    """Submit a Gen-2.5 generate job; returns (task_uuid, subscription_key).

    Args:
        cls: Node class passed through to ``sync_op``.
        request: Request payload sent as multipart form fields.
        images: Optional inputs; ``str`` entries are opened as file paths,
            other non-None entries are converted with ``tensor_to_filelike``.

    Raises:
        ValueError: if more than 5 images are supplied.
        RuntimeError: if the response lacks a uuid or a subscription key.
    """

    if images is not None and len(images) > 5:
        raise ValueError("Rodin Gen-2.5 supports at most 5 input images.")

    # File objects opened here from path strings; closed once the request is done
    # so the descriptors do not leak (in-memory buffers need no cleanup).
    opened_handles: list = []
    files = None
    try:
        if images:
            files = []
            for image in images:
                if image is None:
                    continue
                if isinstance(image, str):
                    handle = open(image, "rb")
                    opened_handles.append(handle)
                    files.append(("images", handle))
                else:
                    files.append(("images", tensor_to_filelike(image)))

        response = await sync_op(
            cls,
            ApiEndpoint(path="/proxy/rodin/api/v2/rodin", method="POST"),
            response_model=Rodin3DGenerateResponse,
            data=request,
            files=files,
            content_type="multipart/form-data",
            multipart_parser=_rodin_multipart_parser,
        )
    finally:
        for handle in opened_handles:
            handle.close()

    if not response.uuid or not response.jobs or not response.jobs.subscription_key:
        raise RuntimeError(f"Rodin Gen-2.5 submit failed: message={response.message!r}")
    return response.uuid, response.jobs.subscription_key
|
||||
|
||||
|
||||
_PREVIEWABLE_3D_EXTS = {".glb", ".obj", ".fbx", ".stl", ".gltf"}
|
||||
|
||||
|
||||
async def _download_gen25_files(
    download_list: Rodin3DDownloadResponse,
    task_uuid: str,
    geometry_file_format: str,
) -> Types.File3D | None:
    """Download every file in the list; return the File3D matching the chosen format.

    Files are written to ``<output dir>/Rodin3D_Gen25_<task_uuid>/``. If no
    file matches ``geometry_file_format``, the first file with a previewable
    3D extension is returned instead; ``None`` if there is none.
    """

    folder_name = f"Rodin3D_Gen25_{task_uuid}"
    save_dir = os.path.join(comfy_paths.get_output_directory(), folder_name)
    os.makedirs(save_dir, exist_ok=True)

    target_ext = f".{geometry_file_format.lower().lstrip('.')}"
    file_3d: Types.File3D | None = None

    for item in download_list.items:
        # The name comes from the remote response: keep only its final path
        # component so it cannot point outside save_dir (e.g. "../x" or "/abs/x").
        safe_name = os.path.basename(item.name)
        file_path = os.path.join(save_dir, safe_name)
        ext = os.path.splitext(safe_name.lower())[1]
        # Prefer the file matching the user's chosen format; fall back below.
        if file_3d is None and ext == target_ext and ext in _PREVIEWABLE_3D_EXTS:
            file_3d = await download_url_to_file_3d(item.url, target_ext.lstrip("."))
            with open(file_path, "wb") as f:
                f.write(file_3d.get_bytes())
            continue
        await download_url_to_bytesio(item.url, file_path)

    # If the chosen format wasn't found, surface any model file we did get.
    if file_3d is None:
        for item in download_list.items:
            ext = os.path.splitext(item.name.lower())[1]
            if ext in _PREVIEWABLE_3D_EXTS:
                file_3d = await download_url_to_file_3d(item.url, ext.lstrip("."))
                break
    return file_3d
|
||||
|
||||
|
||||
_MODE_REGULAR = "Regular"
|
||||
_MODE_FAST = "Fast"
|
||||
_MODE_EXTREME_HIGH = "Extreme-High"
|
||||
|
||||
_REGULAR_POLY_OPTIONS = [
|
||||
"Default",
|
||||
"4K-Quad",
|
||||
"8K-Quad",
|
||||
"18K-Quad",
|
||||
"50K-Quad",
|
||||
"2K-Triangle",
|
||||
"20K-Triangle",
|
||||
"150K-Triangle",
|
||||
"500K-Triangle",
|
||||
"1M-Triangle",
|
||||
]
|
||||
|
||||
_TEXTURE_MODE_OPTIONS = ["Default", "legacy", "extreme-low", "low", "medium", "high"]
|
||||
_GEOMETRY_FORMAT_OPTIONS = ["glb", "fbx", "obj", "stl"]
|
||||
_MATERIAL_OPTIONS = ["PBR", "Shaded", "All", "None"]
|
||||
|
||||
|
||||
def _build_mode_input(name: str = "mode") -> IO.DynamicCombo.Input:
    """Build the generation-mode dynamic combo with its Regular, Fast and Extreme-High options."""
    regular_widgets = [
        IO.Combo.Input(
            "tier",
            options=["Gen-2.5-Low", "Gen-2.5-Medium", "Gen-2.5-High"],
            default="Gen-2.5-High",
            tooltip="Quality tier. Higher tiers produce higher-fidelity geometry.",
        ),
        IO.Combo.Input(
            "polygon_count",
            options=_REGULAR_POLY_OPTIONS,
            default="Default",
            tooltip="Preset face count. 'Default' uses the server's default for the selected tier.",
        ),
        IO.Boolean.Input(
            "creative",
            default=False,
            tooltip="Creative mode (Medium/High only). Enhances generative robustness.",
        ),
    ]

    fast_widgets = [
        IO.Combo.Input(
            "tier",
            options=[
                "Gen-2.5-Extreme-Low",
                "Gen-2.5-Low",
                "Gen-2.5-Medium",
                "Gen-2.5-High",
            ],
            default="Gen-2.5-Low",
        ),
        IO.Int.Input(
            "mesh_faces",
            default=20000,
            min=1000,
            max=20000,
            display_mode=IO.NumberDisplay.number,
            tooltip="Mesh face count (1K-20K in Fast mode).",
        ),
    ]

    extreme_high_widgets = [
        IO.Combo.Input("mesh_mode", options=["Raw", "Quad"], default="Raw"),
        IO.Int.Input(
            "mesh_faces",
            default=1000000,
            min=20000,
            max=2000000,
            display_mode=IO.NumberDisplay.number,
            tooltip=(
                "Mesh face count. Raw mode: 20K-2M. "
                "Quad mode: keep under 200K (upstream may reject higher values)."
            ),
        ),
        IO.Boolean.Input(
            "is_micro",
            default=False,
            tooltip="Enable micro detail (Extreme-High only).",
        ),
        IO.Boolean.Input(
            "creative",
            default=False,
            tooltip="Creative mode. Enhances generative robustness.",
        ),
    ]

    mode_options = [
        IO.DynamicCombo.Option(_MODE_REGULAR, regular_widgets),
        IO.DynamicCombo.Option(_MODE_FAST, fast_widgets),
        IO.DynamicCombo.Option(_MODE_EXTREME_HIGH, extreme_high_widgets),
    ]
    return IO.DynamicCombo.Input(
        name,
        options=mode_options,
        tooltip=(
            "Generation mode. Regular = balanced. Fast = 1K-20K faces for rapid prototyping. "
            "Extreme-High = 20K-2M faces with optional micro details."
        ),
    )
|
||||
|
||||
|
||||
def _build_common_inputs(*, include_image_only: bool) -> list:
    """Assemble the inputs that follow the ``mode`` widget on the Gen-2.5 nodes.

    Args:
        include_image_only: When true, the ``use_original_alpha`` toggle is
            placed between ``texture_delight`` and ``addon_highpack``.

    Returns:
        A freshly built list of IO input definitions, in list order.
    """

    def advanced_flag(name: str, tooltip: str):
        # Optional, advanced boolean toggle that defaults to off.
        return IO.Boolean.Input(name, default=False, optional=True, advanced=True, tooltip=tooltip)

    def advanced_int(name: str, upper: int, tooltip: str):
        # Optional, advanced integer in [0, upper] that defaults to 0.
        return IO.Int.Input(
            name,
            default=0,
            min=0,
            max=upper,
            display_mode=IO.NumberDisplay.number,
            optional=True,
            advanced=True,
            tooltip=tooltip,
        )

    leading = [
        IO.Combo.Input("material", options=_MATERIAL_OPTIONS, default="Shaded"),
        IO.Combo.Input("geometry_file_format", options=_GEOMETRY_FORMAT_OPTIONS, default="glb"),
        IO.Combo.Input(
            "texture_mode",
            options=_TEXTURE_MODE_OPTIONS,
            default="Default",
            optional=True,
            tooltip="Texture quality preset. 'Default' uses the server's default for the selected tier.",
        ),
        IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=65535,
            display_mode=IO.NumberDisplay.number,
            control_after_generate=True,
            optional=True,
        ),
        advanced_flag("TAPose", "T/A pose for human-like models."),
        advanced_flag("hd_texture", "High-quality texture enhancement."),
        advanced_flag("texture_delight", "Remove baked lighting from textures."),
    ]

    # Only the image node exposes the alpha-preservation toggle.
    image_only = (
        [advanced_flag("use_original_alpha", "Preserve image transparency.")]
        if include_image_only
        else []
    )

    trailing = [
        advanced_flag("addon_highpack", "HighPack addon: 4K textures and ~16x faces in Quad mode."),
        advanced_int("bbox_width", 300, "Bounding-box width (Y axis). Set to 0 with the others to skip bbox."),
        advanced_int("bbox_height", 300, "Bounding-box height (Z axis)."),
        advanced_int("bbox_length", 300, "Bounding-box length (X axis)."),
        advanced_int("height_cm", 10000, "Approximate model height in centimeters (0 to skip)."),
    ]

    return [*leading, *image_only, *trailing]
|
||||
|
||||
|
||||
_PRICE_EXPR = """
|
||||
(
|
||||
$baseCredits := widgets.mode = "extreme-high" ? 1.0 : 0.5;
|
||||
$addonCredits := widgets.addon_highpack ? 1.0 : 0.0;
|
||||
$total := ($baseCredits * 1.5) + ($addonCredits * 0.8);
|
||||
{"type":"usd","usd": $total}
|
||||
)
|
||||
"""
|
||||
|
||||
|
||||
def _resolve_mode_params(mode_input: dict) -> dict:
    """Translate the DynamicCombo `mode` payload into Gen-2.5 request fields.

    Args:
        mode_input: Mapping holding the selected ``"mode"`` plus the sub-widget
            values that belong to that mode.

    Returns:
        A dict that always contains ``tier`` and may contain
        ``quality_override``, ``mesh_mode``, ``geometry_instruct_mode`` and
        ``is_micro``. Missing keys mean "do not send" (so we don't override
        server defaults).

    Raises:
        KeyError: If ``mode_input`` lacks ``"mode"`` or a value required by the
            selected mode.
        ValueError: If the selected mode is not one of the known modes.
    """
    selected = mode_input["mode"]
    out: dict = {}

    if selected == _MODE_REGULAR:
        out["tier"] = mode_input["tier"]
        polygon = mode_input.get("polygon_count", "Default")
        # "Default" leaves mesh mode and face count to the server.
        if polygon != "Default":
            mesh_mode, faces = get_quality_mode(polygon)
            out["mesh_mode"] = mesh_mode
            out["quality_override"] = faces
        if mode_input.get("creative"):
            out["geometry_instruct_mode"] = "creative"

    elif selected == _MODE_FAST:
        out["tier"] = mode_input["tier"]
        out["mesh_mode"] = "Raw"
        out["quality_override"] = int(mode_input["mesh_faces"])

    elif selected == _MODE_EXTREME_HIGH:
        out["tier"] = "Gen-2.5-Extreme-High"
        out["mesh_mode"] = mode_input["mesh_mode"]
        out["quality_override"] = int(mode_input["mesh_faces"])
        if mode_input.get("is_micro"):
            out["is_micro"] = True
        if mode_input.get("creative"):
            out["geometry_instruct_mode"] = "creative"

    else:
        # Fail here with a clear message instead of returning an empty dict
        # that would only surface later as a KeyError on "tier".
        raise ValueError(f"Unknown Rodin Gen-2.5 mode: {selected!r}")

    return out
|
||||
|
||||
|
||||
def _build_request(
    *,
    mode_input: dict,
    material: str,
    geometry_file_format: str,
    texture_mode: str,
    seed: int,
    TAPose: bool,
    hd_texture: bool,
    texture_delight: bool,
    addon_highpack: bool,
    bbox_width: int,
    bbox_height: int,
    bbox_length: int,
    height_cm: int,
    prompt: str | None = None,
    use_original_alpha: bool = False,
) -> Rodin3DGen25Request:
    """Turn node widget values into a ``Rodin3DGen25Request``.

    Falsy optional values (empty prompt, zero height, unset toggles) are passed
    as ``None``; the ``"Default"`` texture mode is likewise passed as ``None``.
    Mode-specific fields come from ``_resolve_mode_params``.
    """
    resolved = _resolve_mode_params(mode_input)

    # A bounding box is only sent when all three dimensions are non-zero.
    has_full_bbox = bool(bbox_width and bbox_height and bbox_length)
    bbox_condition = [bbox_width, bbox_height, bbox_length] if has_full_bbox else None

    chosen_texture_mode = texture_mode if texture_mode != "Default" else None
    addons = ["HighPack"] if addon_highpack else None

    return Rodin3DGen25Request(
        tier=resolved["tier"],
        prompt=prompt or None,
        seed=seed,
        material=material,
        geometry_file_format=geometry_file_format,
        texture_mode=chosen_texture_mode,
        mesh_mode=resolved.get("mesh_mode"),
        quality_override=resolved.get("quality_override"),
        geometry_instruct_mode=resolved.get("geometry_instruct_mode"),
        bbox_condition=bbox_condition,
        height=height_cm or None,
        TAPose=TAPose or None,
        hd_texture=hd_texture or None,
        texture_delight=texture_delight or None,
        is_micro=resolved.get("is_micro"),
        use_original_alpha=use_original_alpha or None,
        addons=addons,
    )
|
||||
|
||||
|
||||
class Rodin3D_Gen25_Image(IO.ComfyNode):
    # API node: builds a Gen-2.5 request from 1-5 images and returns the generated 3D file.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Declare the node's inputs, output, hidden fields and price badge."""
        image_slots = IO.Autogrow.Input(
            "images",
            template=IO.Autogrow.TemplatePrefix(IO.Image.Input("image"), prefix="image", min=1, max=5),
            tooltip="1-5 images. The first image is used for materials when multi-view.",
        )
        node_inputs = [
            image_slots,
            _build_mode_input(),
            *_build_common_inputs(include_image_only=True),
        ]
        badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["mode", "addon_highpack"]),
            expr=_PRICE_EXPR,
        )
        return IO.Schema(
            node_id="Rodin3D_Gen25_Image",
            display_name="Rodin 3D Gen-2.5 - Image to 3D",
            category="api node/3d/Rodin",
            description=(
                "Generate a 3D model from 1-5 reference images via Rodin Gen-2.5. "
                "Pick a mode (Fast / Regular / Extreme-High) to tune quality vs. cost."
            ),
            inputs=node_inputs,
            outputs=[IO.File3DAny.Output(display_name="model_file")],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=badge,
        )

    @classmethod
    async def execute(
        cls,
        images: IO.Autogrow.Type,
        mode: dict,
        material: str,
        geometry_file_format: str,
        texture_mode: str,
        seed: int,
        TAPose: bool,
        hd_texture: bool,
        texture_delight: bool,
        use_original_alpha: bool,
        addon_highpack: bool,
        bbox_width: int,
        bbox_height: int,
        bbox_length: int,
        height_cm: int,
    ) -> IO.NodeOutput:
        """Create a Gen-2.5 task from the images, wait for it, and return the model file.

        Raises:
            ValueError: If no image is connected, or more than 5 frames remain
                after batches are flattened.
        """
        connected = [entry for entry in images.values() if entry is not None]
        if not connected:
            raise ValueError("Rodin Gen-2.5 Image-to-3D requires at least one image.")

        # Split 4-D inputs along their first axis so each frame is sent as a separate part.
        frames: list = []
        for entry in connected:
            is_batch = hasattr(entry, "shape") and len(entry.shape) == 4
            if is_batch:
                frames.extend(entry[idx] for idx in range(entry.shape[0]))
            else:
                frames.append(entry)

        frame_count = len(frames)
        if frame_count > 5:
            raise ValueError(f"Rodin Gen-2.5 accepts at most 5 images; received {len(frames)}.")

        request = _build_request(
            mode_input=mode,
            material=material,
            geometry_file_format=geometry_file_format,
            texture_mode=texture_mode,
            seed=seed,
            TAPose=TAPose,
            hd_texture=hd_texture,
            texture_delight=texture_delight,
            addon_highpack=addon_highpack,
            bbox_width=bbox_width,
            bbox_height=bbox_height,
            bbox_length=bbox_length,
            height_cm=height_cm,
            prompt=None,
            use_original_alpha=use_original_alpha,
        )

        task_uuid, subscription_key = await _create_gen25_task(cls, request, frames)
        await poll_for_task_status(subscription_key, cls)
        downloads = await get_rodin_download_list(task_uuid, cls)
        model_file = await _download_gen25_files(downloads, task_uuid, geometry_file_format)
        return IO.NodeOutput(model_file)
|
||||
|
||||
|
||||
class Rodin3D_Gen25_Text(IO.ComfyNode):
    # API node: builds a Gen-2.5 request from a text prompt and returns the generated 3D file.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Declare the node's inputs, output, hidden fields and price badge."""
        prompt_input = IO.String.Input(
            "prompt",
            multiline=True,
            default="",
            tooltip="Text prompt for the 3D model.",
        )
        node_inputs = [
            prompt_input,
            _build_mode_input(),
            *_build_common_inputs(include_image_only=False),
        ]
        badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["mode", "addon_highpack"]),
            expr=_PRICE_EXPR,
        )
        return IO.Schema(
            node_id="Rodin3D_Gen25_Text",
            display_name="Rodin 3D Gen-2.5 - Text to 3D",
            category="api node/3d/Rodin",
            description=(
                "Generate a 3D model from a text prompt via Rodin Gen-2.5. "
                "Pick a mode (Fast / Regular / Extreme-High) to tune quality vs. cost."
            ),
            inputs=node_inputs,
            outputs=[IO.File3DAny.Output(display_name="model_file")],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=badge,
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        mode: dict,
        material: str,
        geometry_file_format: str,
        texture_mode: str,
        seed: int,
        TAPose: bool,
        hd_texture: bool,
        texture_delight: bool,
        addon_highpack: bool,
        bbox_width: int,
        bbox_height: int,
        bbox_length: int,
        height_cm: int,
    ) -> IO.NodeOutput:
        """Validate the prompt, create a Gen-2.5 task, wait for it, and return the model file."""
        # Prompt is checked with min_length=1 and max_length=2500 before any request is built.
        validate_string(prompt, field_name="prompt", min_length=1, max_length=2500)

        request = _build_request(
            prompt=prompt,
            mode_input=mode,
            material=material,
            geometry_file_format=geometry_file_format,
            texture_mode=texture_mode,
            seed=seed,
            TAPose=TAPose,
            hd_texture=hd_texture,
            texture_delight=texture_delight,
            addon_highpack=addon_highpack,
            bbox_width=bbox_width,
            bbox_height=bbox_height,
            bbox_length=bbox_length,
            height_cm=height_cm,
        )

        task_uuid, subscription_key = await _create_gen25_task(cls, request, images=None)
        await poll_for_task_status(subscription_key, cls)
        downloads = await get_rodin_download_list(task_uuid, cls)
        model_file = await _download_gen25_files(downloads, task_uuid, geometry_file_format)
        return IO.NodeOutput(model_file)
|
||||
|
||||
|
||||
class Rodin3DExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
@ -551,6 +1114,8 @@ class Rodin3DExtension(ComfyExtension):
|
||||
Rodin3D_Smooth,
|
||||
Rodin3D_Sketch,
|
||||
Rodin3D_Gen2,
|
||||
Rodin3D_Gen25_Image,
|
||||
Rodin3D_Gen25_Text,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -16,16 +16,17 @@ from .conversions import (
|
||||
convert_mask_to_image,
|
||||
downscale_image_tensor,
|
||||
downscale_image_tensor_by_max_side,
|
||||
downscale_video_to_max_pixels,
|
||||
image_tensor_pair_to_batch,
|
||||
pil_to_bytesio,
|
||||
resize_mask_to_image,
|
||||
resize_video_to_pixel_budget,
|
||||
tensor_to_base64_string,
|
||||
tensor_to_bytesio,
|
||||
tensor_to_pil,
|
||||
text_filepath_to_base64_string,
|
||||
text_filepath_to_data_uri,
|
||||
trim_video,
|
||||
upscale_video_to_min_pixels,
|
||||
video_to_base64_string,
|
||||
)
|
||||
from .download_helpers import (
|
||||
@ -88,16 +89,17 @@ __all__ = [
|
||||
"convert_mask_to_image",
|
||||
"downscale_image_tensor",
|
||||
"downscale_image_tensor_by_max_side",
|
||||
"downscale_video_to_max_pixels",
|
||||
"image_tensor_pair_to_batch",
|
||||
"pil_to_bytesio",
|
||||
"resize_mask_to_image",
|
||||
"resize_video_to_pixel_budget",
|
||||
"tensor_to_base64_string",
|
||||
"tensor_to_bytesio",
|
||||
"tensor_to_pil",
|
||||
"text_filepath_to_base64_string",
|
||||
"text_filepath_to_data_uri",
|
||||
"trim_video",
|
||||
"upscale_video_to_min_pixels",
|
||||
"video_to_base64_string",
|
||||
# Validation utilities
|
||||
"get_image_dimensions",
|
||||
|
||||
@ -415,14 +415,48 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video:
|
||||
raise RuntimeError(f"Failed to trim video: {str(e)}") from e
|
||||
|
||||
|
||||
def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video:
|
||||
"""Downscale a video to fit within ``total_pixels`` (w * h), preserving aspect ratio.
|
||||
def downscale_video_to_max_pixels(video: Input.Video, max_pixels: int) -> Input.Video:
|
||||
"""Downscale a video to fit within ``max_pixels`` (w * h), preserving aspect ratio.
|
||||
|
||||
Returns the original video object untouched when it already fits. Preserves frame rate, duration, and audio.
|
||||
Aspect ratio is preserved up to a fraction of a percent (even-dim rounding).
|
||||
"""
|
||||
src_w, src_h = video.get_dimensions()
|
||||
scale_dims = _compute_downscale_dims(src_w, src_h, total_pixels)
|
||||
scale_dims = _compute_downscale_dims(src_w, src_h, max_pixels)
|
||||
if scale_dims is None:
|
||||
return video
|
||||
return _apply_video_scale(video, scale_dims)
|
||||
|
||||
|
||||
def _compute_upscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
|
||||
"""Return upscaled (w, h) with even dims meeting at least ``total_pixels``, or None if already large enough.
|
||||
|
||||
Source aspect ratio is preserved; output may drift by a fraction of a percent because both dimensions
|
||||
are rounded up to even values (many codecs require divisible-by-2). The result is guaranteed to be at
|
||||
least ``total_pixels``.
|
||||
"""
|
||||
pixels = src_w * src_h
|
||||
if pixels >= total_pixels:
|
||||
return None
|
||||
scale = math.sqrt(total_pixels / pixels)
|
||||
new_w = math.ceil(src_w * scale)
|
||||
new_h = math.ceil(src_h * scale)
|
||||
if new_w % 2:
|
||||
new_w += 1
|
||||
if new_h % 2:
|
||||
new_h += 1
|
||||
return new_w, new_h
|
||||
|
||||
|
||||
def upscale_video_to_min_pixels(video: Input.Video, min_pixels: int) -> Input.Video:
    """Scale a video up until its frame area (w * h) reaches ``min_pixels``.

    The input object is returned as-is when ``_compute_upscale_dims`` reports that no
    upscaling is needed; otherwise the scaled video from ``_apply_video_scale`` is returned.
    Aspect ratio is kept up to the even-dimension rounding done when computing the target size.
    Note: upscaling a low-resolution source does not add real detail; downstream model quality may suffer.
    """
    width, height = video.get_dimensions()
    target_dims = _compute_upscale_dims(width, height, min_pixels)
    return video if target_dims is None else _apply_video_scale(video, target_dims)
|
||||
|
||||
@ -77,7 +77,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
|
||||
return io.NodeOutput({"samples": latent})
|
||||
return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 32})
|
||||
|
||||
generate = execute # TODO: remove
|
||||
|
||||
|
||||
@ -1,10 +1,41 @@
|
||||
import re
|
||||
import json
|
||||
import string
|
||||
from typing_extensions import override
|
||||
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
class StringFormat(io.ComfyNode):
    # Node that renders a format string using named, auto-growing value inputs.

    @classmethod
    def define_schema(cls) -> io.Schema:
        """Declare the auto-growing ``values`` input, the format string input and the string output."""
        # One optional slot per lowercase ASCII letter; none are required (min=0).
        value_slots = io.Autogrow.TemplateNames(
            input=io.AnyType.Input("value"),
            names=[*string.ascii_lowercase],
            min=0,
        )
        node_inputs = [
            io.Autogrow.Input("values", template=value_slots),
            io.String.Input("f_string", default="{a}", multiline=True),
        ]
        node_outputs = [io.String.Output()]
        return io.Schema(
            node_id="StringFormat",
            display_name="Format Text",
            category="text",
            search_aliases=["string", "format"],
            description="Same as Python's string format method. Supports all of Python's format options and features.",
            inputs=node_inputs,
            outputs=node_outputs,
        )

    @classmethod
    def execute(
        cls, values: io.Autogrow.Type, f_string: str
    ) -> io.NodeOutput:
        """Format ``f_string`` with the connected values passed as keyword arguments."""
        # NOTE(review): str.format fields can access attributes/items of the supplied
        # values — confirm this is acceptable when the format string is untrusted.
        rendered = f_string.format(**values)
        return io.NodeOutput(rendered)
|
||||
|
||||
|
||||
class StringConcatenate(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
@ -413,6 +444,7 @@ class StringExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
StringFormat,
|
||||
StringConcatenate,
|
||||
StringSubstring,
|
||||
StringLength,
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.21.1"
|
||||
__version__ = "0.22.2"
|
||||
|
||||
22
openapi.yaml
22
openapi.yaml
@ -4160,6 +4160,10 @@ paths:
|
||||
name:
|
||||
type: string
|
||||
description: Display name for the API key
|
||||
description:
|
||||
type: string
|
||||
description: User-provided description of the key's purpose
|
||||
maxLength: 5000
|
||||
responses:
|
||||
"201":
|
||||
description: API key created
|
||||
@ -6351,14 +6355,6 @@ components:
|
||||
type: integer
|
||||
format: int64
|
||||
description: Size of the asset in bytes
|
||||
width:
|
||||
type: integer
|
||||
nullable: true
|
||||
description: "Original image width in pixels. Null for non-image assets or assets ingested before dimension extraction."
|
||||
height:
|
||||
type: integer
|
||||
nullable: true
|
||||
description: "Original image height in pixels. Null for non-image assets or assets ingested before dimension extraction."
|
||||
mime_type:
|
||||
type: string
|
||||
description: MIME type of the asset
|
||||
@ -7685,11 +7681,16 @@ components:
|
||||
required:
|
||||
- id
|
||||
- name
|
||||
- description
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
description:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
|
||||
prefix:
|
||||
type: string
|
||||
description: First few characters of the key for identification
|
||||
@ -7710,12 +7711,17 @@ components:
|
||||
required:
|
||||
- id
|
||||
- name
|
||||
- description
|
||||
- key
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
description:
|
||||
type: string
|
||||
maxLength: 5000
|
||||
description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
|
||||
key:
|
||||
type: string
|
||||
description: Full API key value (only returned on creation)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.21.1"
|
||||
version = "0.22.2"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.10"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.43.18
|
||||
comfyui-workflow-templates==0.9.77
|
||||
comfyui-workflow-templates==0.9.82
|
||||
comfyui-embedded-docs==0.5.0
|
||||
torch
|
||||
torchsde
|
||||
|
||||
Reference in New Issue
Block a user