Compare commits

..

8 Commits

10 changed files with 41 additions and 169 deletions

View File

@ -1274,148 +1274,13 @@ def force_channels_last():
return False
_INTEL_XPU_DISCRETE = None


def is_intel_xpu_discrete():
    """Report whether the active Intel XPU is a discrete GPU.

    torch.xpu does not expose the integrated-vs-discrete distinction, so the
    Level Zero loader is queried directly through ctypes (ze_loader.dll on
    Windows; find_library("ze_loader"), libze_loader.so.1 or libze_loader.so
    elsewhere). The answer is computed on the first call and cached in the
    module-level _INTEL_XPU_DISCRETE.

    Returns:
        True only when Level Zero reports the relevant GPU(s) as not
        integrated. Any failure or ambiguity yields False so that a
        discrete-only fast path is never enabled by mistake.
    """
    global _INTEL_XPU_DISCRETE
    if _INTEL_XPU_DISCRETE is not None:
        return _INTEL_XPU_DISCRETE

    # Store the conservative answer first: every early "return False" below
    # then leaves the cache populated without touching it again.
    _INTEL_XPU_DISCRETE = False
    if not is_intel_xpu():
        return False

    try:
        import ctypes
        import ctypes.util

        ZE_RESULT_SUCCESS = 0
        ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x3
        ZE_DEVICE_TYPE_GPU = 1
        ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = 1 << 0
        ZE_MAX_DEVICE_NAME = 256

        class ze_device_uuid_t(ctypes.Structure):
            _fields_ = [("id", ctypes.c_ubyte * 16)]

        # Field order and widths must mirror the C ze_device_properties_t
        # layout, since the loader writes into this buffer directly.
        class ze_device_properties_t(ctypes.Structure):
            _fields_ = [
                ("stype", ctypes.c_uint32),
                ("pNext", ctypes.c_void_p),
                ("type", ctypes.c_uint32),
                ("vendorId", ctypes.c_uint32),
                ("deviceId", ctypes.c_uint32),
                ("flags", ctypes.c_uint32),
                ("subdeviceId", ctypes.c_uint32),
                ("coreClockRate", ctypes.c_uint32),
                ("maxMemAllocSize", ctypes.c_uint64),
                ("maxHardwareContexts", ctypes.c_uint32),
                ("maxCommandQueuePriority", ctypes.c_uint32),
                ("numThreadsPerEU", ctypes.c_uint32),
                ("physicalEUSimdWidth", ctypes.c_uint32),
                ("numEUsPerSubslice", ctypes.c_uint32),
                ("numSubslicesPerSlice", ctypes.c_uint32),
                ("numSlices", ctypes.c_uint32),
                ("timerResolution", ctypes.c_uint64),
                ("timestampValidBits", ctypes.c_uint32),
                ("kernelTimestampValidBits", ctypes.c_uint32),
                ("uuid", ze_device_uuid_t),
                ("name", ctypes.c_char * ZE_MAX_DEVICE_NAME),
            ]

        if sys.platform == "win32":
            candidates = ["ze_loader.dll"]
        else:
            candidates = [ctypes.util.find_library("ze_loader"), "libze_loader.so.1", "libze_loader.so"]

        # Take the first candidate that loads; find_library may have
        # produced None, which filter(None, ...) drops.
        lib = None
        for candidate in filter(None, candidates):
            try:
                lib = ctypes.CDLL(candidate)
            except OSError:
                continue
            break
        if lib is None:
            return False

        # Declare the prototypes of the four entry points used below; each
        # one returns a ze_result_t, modelled here as uint32.
        u32_ptr = ctypes.POINTER(ctypes.c_uint32)
        handle_ptr = ctypes.POINTER(ctypes.c_void_p)
        signatures = (
            ("zeInit", [ctypes.c_uint32]),
            ("zeDriverGet", [u32_ptr, handle_ptr]),
            ("zeDeviceGet", [ctypes.c_void_p, u32_ptr, handle_ptr]),
            ("zeDeviceGetProperties", [ctypes.c_void_p, ctypes.POINTER(ze_device_properties_t)]),
        )
        for symbol, argtypes in signatures:
            entry = getattr(lib, symbol)
            entry.argtypes = argtypes
            entry.restype = ctypes.c_uint32

        if lib.zeInit(0) != ZE_RESULT_SUCCESS:
            return False

        # Device id of the XPU torch is currently using, when torch can
        # provide one; None means "no id to match against".
        try:
            active_props = torch.xpu.get_device_properties(torch.xpu.current_device())
            torch_device_id = int(active_props.device_id)
        except Exception:
            torch_device_id = None

        # Two-step enumeration: ask for the count, then fill a buffer of
        # that size.
        driver_count = ctypes.c_uint32(0)
        status = lib.zeDriverGet(ctypes.byref(driver_count), None)
        if status != ZE_RESULT_SUCCESS or driver_count.value == 0:
            return False
        driver_capacity = driver_count.value
        drivers = (ctypes.c_void_p * driver_capacity)()
        if lib.zeDriverGet(ctypes.byref(driver_count), drivers) != ZE_RESULT_SUCCESS:
            return False

        gpu_devices = []  # (deviceId, is_integrated)
        # min() guards against the second call reporting more entries than
        # the buffer was allocated for.
        for driver_index in range(min(driver_count.value, driver_capacity)):
            driver = drivers[driver_index]
            device_count = ctypes.c_uint32(0)
            if lib.zeDeviceGet(driver, ctypes.byref(device_count), None) != ZE_RESULT_SUCCESS:
                return False
            device_capacity = device_count.value
            if device_capacity == 0:
                continue
            devices = (ctypes.c_void_p * device_capacity)()
            if lib.zeDeviceGet(driver, ctypes.byref(device_count), devices) != ZE_RESULT_SUCCESS:
                return False
            for device_index in range(min(device_count.value, device_capacity)):
                props = ze_device_properties_t(stype=ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, pNext=None)
                if lib.zeDeviceGetProperties(devices[device_index], ctypes.byref(props)) != ZE_RESULT_SUCCESS:
                    return False
                if props.type == ZE_DEVICE_TYPE_GPU:
                    is_integrated = bool(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)
                    gpu_devices.append((int(props.deviceId), is_integrated))

        if not gpu_devices:
            return False

        if torch_device_id is not None:
            matched_flags = [flag for dev_id, flag in gpu_devices if dev_id == torch_device_id]
            if matched_flags:
                # Fail closed if a duplicate PCI device id somehow mixes flags.
                _INTEL_XPU_DISCRETE = all(not flag for flag in matched_flags)
                return _INTEL_XPU_DISCRETE

        # No reliable match: only enable when every visible GPU is discrete so a
        # mixed iGPU+dGPU system never enables streams while running on the iGPU.
        _INTEL_XPU_DISCRETE = not any(flag for _, flag in gpu_devices)
        return _INTEL_XPU_DISCRETE
    except Exception as e:
        logging.info("Could not determine Intel XPU type via Level Zero: {}".format(e))
        _INTEL_XPU_DISCRETE = False
        return False
STREAMS = {}
NUM_STREAMS = 0
if args.async_offload is not None:
NUM_STREAMS = args.async_offload
else:
# Enable by default on Nvidia, AMD, and discrete Intel XPU
if not args.disable_async_offload and (is_nvidia() or is_amd() or is_intel_xpu_discrete()):
# Enable by default on Nvidia and AMD
if is_nvidia() or is_amd():
NUM_STREAMS = 2
if args.disable_async_offload:
@ -1622,7 +1487,7 @@ PINNED_MEMORY = {}
TOTAL_PINNED_MEMORY = 0
MAX_PINNED_MEMORY = -1
if not args.disable_pinned_memory:
if is_nvidia() or is_amd() or is_intel_xpu():
if is_nvidia() or is_amd():
ram = get_total_memory(torch.device("cpu"))
if WINDOWS:
MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50%
@ -1647,20 +1512,6 @@ def discard_cuda_async_error():
#Dump it! We already know about it from the synchronous return
pass
def host_register(ptr, size):
    """Register a host memory range for pinned transfers.

    Args:
        ptr: Address of the start of the host buffer.
        size: Length of the buffer in bytes.

    Returns:
        0 on Intel XPU (nothing is registered); otherwise the status code
        returned by cudaHostRegister(ptr, size, 1).
    """
    if not is_intel_xpu():
        return torch.cuda.cudart().cudaHostRegister(ptr, size, 1)
    # Intel XPU has no CUDA host-registration API. The pinnable buffers used by
    # the DynamicVRAM path are already Level Zero host USM (allocated through
    # the aimdo hostbuf / zeMemAllocHost), and pageable host memory is still
    # usable for transfers, so registration is reported as a no-op success.
    return 0
def host_unregister(ptr):
    """Undo a host memory registration for the buffer starting at ptr.

    Returns:
        0 on Intel XPU (no CUDA call is made); otherwise the status code
        returned by cudaHostUnregister(ptr).
    """
    if not is_intel_xpu():
        return torch.cuda.cudart().cudaHostUnregister(ptr)
    # Nothing is registered on the XPU path, so report success.
    return 0
def pin_memory(tensor):
global TOTAL_PINNED_MEMORY
if MAX_PINNED_MEMORY <= 0:
@ -1689,7 +1540,7 @@ def pin_memory(tensor):
if ptr == 0:
return False
if host_register(ptr, size) == 0:
if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0:
PINNED_MEMORY[ptr] = size
TOTAL_PINNED_MEMORY += size
return True
@ -1719,7 +1570,7 @@ def unpin_memory(tensor):
logging.warning("Size of pinned tensor changed")
return False
if host_unregister(ptr) == 0:
if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
size = PINNED_MEMORY.pop(ptr)
TOTAL_PINNED_MEMORY -= size
return True

View File

@ -1961,7 +1961,7 @@ class ModelPatcherDynamic(ModelPatcher):
if not module._pin_registered:
continue
size = module._pin.numel() * module._pin.element_size()
if comfy.model_management.host_unregister(module._pin.data_ptr()) != 0:
if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0:
comfy.model_management.discard_cuda_async_error()
continue
module._pin_registered = False

View File

@ -53,7 +53,7 @@ def get_pin(module, subset="weights"):
size = pin.nbytes
comfy.model_management.ensure_pin_registerable(size)
if comfy.model_management.host_register(pin.data_ptr(), size) != 0:
if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
comfy.model_management.discard_cuda_async_error()
return pin
@ -95,10 +95,10 @@ def pin_memory(module, subset="weights", size=None):
extended = True
pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
pin.untyped_storage()._comfy_hostbuf = hostbuf
if comfy.model_management.host_register(pin.data_ptr(), size) != 0:
if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
comfy.model_management.discard_cuda_async_error()
comfy.model_management.free_registrations(size)
if comfy.model_management.host_register(pin.data_ptr(), size) != 0:
if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
comfy.model_management.discard_cuda_async_error()
del pin
hostbuf.truncate(offset, do_unregister=False)

View File

@ -177,6 +177,10 @@ SEEDANCE2_PRICE_PER_1K_TOKENS = {
("dreamina-seedance-2-0-fast-260128", True, "480p"): 0.0033,
("dreamina-seedance-2-0-fast-260128", False, "720p"): 0.0056,
("dreamina-seedance-2-0-fast-260128", True, "720p"): 0.0033,
("dreamina-seedance-2-0-mini", False, "480p"): 0.0035,
("dreamina-seedance-2-0-mini", True, "480p"): 0.0021,
("dreamina-seedance-2-0-mini", False, "720p"): 0.0035,
("dreamina-seedance-2-0-mini", True, "720p"): 0.0021,
}
@ -278,6 +282,10 @@ SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
"480p": {"min": 409_600, "max": 927_408},
"720p": {"min": 409_600, "max": 927_408},
},
"dreamina-seedance-2-0-mini": {
"480p": {"min": 409_600, "max": 927_408},
"720p": {"min": 409_600, "max": 927_408},
},
}
# The times in this dictionary are given for a 10-second duration.

View File

@ -89,6 +89,7 @@ BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT = "/proxy/byteplus-seedance2/api/v3/cont
SEEDANCE_MODELS = {
"Seedance 2.0": "dreamina-seedance-2-0-260128",
"Seedance 2.0 Fast": "dreamina-seedance-2-0-fast-260128",
"Seedance 2.0 Mini": "dreamina-seedance-2-0-mini",
}
DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-250428"}
@ -1623,8 +1624,10 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
options=[
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p", "4k"])),
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])),
IO.DynamicCombo.Option("Seedance 2.0 Mini", _seedance2_text_inputs(["480p", "720p"])),
],
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
tooltip="Seedance 2.0 for maximum quality; Fast for speed optimization; "
"Mini for the fastest, lowest-cost generation.",
),
IO.Int.Input(
"seed",
@ -1666,6 +1669,7 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
$dur := $lookup(widgets, "model.duration");
$pricePer1K := $res = "4k" ? 0.00572 :
$res = "1080p" ? 0.011011 :
$contains($m, "mini") ? 0.005005 :
$contains($m, "fast") ? 0.008008 : 0.01001;
$rate := $res = "4k" ? $rate4k :
$res = "1080p" ? $rate1080 :
@ -1734,8 +1738,13 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
"Seedance 2.0 Fast",
_seedance2_text_inputs(["480p", "720p"], default_ratio="adaptive"),
),
IO.DynamicCombo.Option(
"Seedance 2.0 Mini",
_seedance2_text_inputs(["480p", "720p"], default_ratio="adaptive"),
),
],
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
tooltip="Seedance 2.0 for maximum quality; Fast for speed optimization; "
"Mini for the fastest, lowest-cost generation.",
),
IO.Image.Input(
"first_frame",
@ -1801,6 +1810,7 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
$dur := $lookup(widgets, "model.duration");
$pricePer1K := $res = "4k" ? 0.00572 :
$res = "1080p" ? 0.011011 :
$contains($m, "mini") ? 0.005005 :
$contains($m, "fast") ? 0.008008 : 0.01001;
$rate := $res = "4k" ? $rate4k :
$res = "1080p" ? $rate1080 :
@ -2024,8 +2034,13 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
"Seedance 2.0 Fast",
_seedance2_reference_inputs(["480p", "720p"], default_ratio="adaptive"),
),
IO.DynamicCombo.Option(
"Seedance 2.0 Mini",
_seedance2_reference_inputs(["480p", "720p"], default_ratio="adaptive"),
),
],
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
tooltip="Seedance 2.0 for maximum quality; Fast for speed optimization; "
"Mini for the fastest, lowest-cost generation.",
),
IO.Int.Input(
"seed",
@ -2071,9 +2086,11 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
$dur := $lookup(widgets, "model.duration");
$noVideoPricePer1K := $res = "4k" ? 0.00572 :
$res = "1080p" ? 0.011011 :
$contains($m, "mini") ? 0.005005 :
$contains($m, "fast") ? 0.008008 : 0.01001;
$videoPricePer1K := $res = "4k" ? 0.003432 :
$res = "1080p" ? 0.006721 :
$contains($m, "mini") ? 0.003003 :
$contains($m, "fast") ? 0.004719 : 0.006149;
$rate := $res = "4k" ? $rate4k :
$res = "1080p" ? $rate1080 :

View File

@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.26.0"
__version__ = "0.26.2"

View File

@ -236,7 +236,7 @@ import hook_breaker_ac10a0
import comfy.memory_management
import comfy.model_patcher
if args.enable_dynamic_vram or (enables_dynamic_vram() and (comfy.model_management.is_nvidia() or comfy.model_management.is_intel_xpu()) and not comfy.model_management.is_wsl()):
if args.enable_dynamic_vram or (enables_dynamic_vram() and comfy.model_management.is_nvidia() and not comfy.model_management.is_wsl()):
if (not args.enable_dynamic_vram) and (comfy.model_management.torch_version_numeric < (2, 8)):
logging.warning("Unsupported Pytorch detected. DynamicVRAM support requires Pytorch version 2.8 or later. Falling back to legacy ModelPatcher. VRAM estimates may be unreliable especially on Windows")
else:

View File

@ -2357,10 +2357,6 @@ paths:
description: |
Returns a list of model folders available in the system.
This is an experimental endpoint that replaces the legacy /models endpoint.
Each folder's name is the identifier to pass to /api/experiment/models/{folder}.
Once the model_type migration is active the names are model_type folder_names
(e.g. `ultralytics_bbox`); a folder with no folder_name mapping is returned by
its directory path.
operationId: getModelFolders
responses:
"200":

View File

@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.26.0"
version = "0.26.2"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"

View File

@ -1,5 +1,5 @@
comfyui-frontend-package==1.45.19
comfyui-workflow-templates==0.10.2
comfyui-workflow-templates==0.10.7
comfyui-embedded-docs==0.5.5
torch
torchsde