Enable AIMDO DynamicVRAM and async offload on Intel XPU

- main.py: extend the DynamicVRAM enablement gate to is_intel_xpu() (was Nvidia-only)
- model_management.py: add XPU-safe host_register/host_unregister helpers (no CUDA host-registration API on XPU; pinnable buffers are already Level Zero host USM) and route the cudaHostRegister/cudaHostUnregister sites through them
- model_management.py: add is_intel_xpu_discrete(), which queries Level Zero (ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) via ctypes on both Windows (ze_loader.dll) and Linux (libze_loader.so.1), matching the active torch device by PCI deviceId; fails closed on any error or ambiguity
- model_management.py: enable async weight-offload streams (NUM_STREAMS=2) by default on discrete Intel XPU; user --async-offload/--disable-async-offload overrides preserved
- model_patcher.py, pinned_memory.py: route remaining host (un)register calls through the XPU-safe helpers

device_supports_non_blocking() is unchanged (XPU stays blocking): the ~15% async win comes from stream overlap, not non-blocking copies. Validated end-to-end on a discrete Intel Arc B570 (Windows, torch 2.10.0+xpu).

Amp-Thread-ID: https://ampcode.com/threads/T-019ef7fa-0c6c-743e-b9c6-f9597ddcfa75
Co-authored-by: Amp <amp@ampcode.com>
2026-06-26 07:56:54 +08:00 · 2026-06-24 19:29:24 -07:00
8 changed files with 160 additions and 88 deletions
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -1274,13 +1274,148 @@ def force_channels_last():
    return False


+_INTEL_XPU_DISCRETE = None
+def is_intel_xpu_discrete():
+    # Returns True only if the active Intel XPU is a discrete GPU. torch.xpu does
+    # not expose the integrated-vs-discrete distinction, so we query Level Zero
+    # directly via ctypes. Works on Windows (ze_loader.dll) and Linux
+    # (libze_loader.so.1). Any failure or ambiguity returns False so a
+    # discrete-only fast path is never enabled by mistake.
+    global _INTEL_XPU_DISCRETE
+    if _INTEL_XPU_DISCRETE is not None:
+        return _INTEL_XPU_DISCRETE
+    _INTEL_XPU_DISCRETE = False
+    if not is_intel_xpu():
+        return False
+
+    try:
+        import ctypes
+        import ctypes.util
+
+        ZE_RESULT_SUCCESS = 0
+        ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x3
+        ZE_DEVICE_TYPE_GPU = 1
+        ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = 1 << 0
+        ZE_MAX_DEVICE_NAME = 256
+
+        class ze_device_uuid_t(ctypes.Structure):
+            _fields_ = [("id", ctypes.c_ubyte * 16)]
+
+        class ze_device_properties_t(ctypes.Structure):
+            _fields_ = [
+                ("stype", ctypes.c_uint32),
+                ("pNext", ctypes.c_void_p),
+                ("type", ctypes.c_uint32),
+                ("vendorId", ctypes.c_uint32),
+                ("deviceId", ctypes.c_uint32),
+                ("flags", ctypes.c_uint32),
+                ("subdeviceId", ctypes.c_uint32),
+                ("coreClockRate", ctypes.c_uint32),
+                ("maxMemAllocSize", ctypes.c_uint64),
+                ("maxHardwareContexts", ctypes.c_uint32),
+                ("maxCommandQueuePriority", ctypes.c_uint32),
+                ("numThreadsPerEU", ctypes.c_uint32),
+                ("physicalEUSimdWidth", ctypes.c_uint32),
+                ("numEUsPerSubslice", ctypes.c_uint32),
+                ("numSubslicesPerSlice", ctypes.c_uint32),
+                ("numSlices", ctypes.c_uint32),
+                ("timerResolution", ctypes.c_uint64),
+                ("timestampValidBits", ctypes.c_uint32),
+                ("kernelTimestampValidBits", ctypes.c_uint32),
+                ("uuid", ze_device_uuid_t),
+                ("name", ctypes.c_char * ZE_MAX_DEVICE_NAME),
+            ]
+
+        if sys.platform == "win32":
+            loader_names = ["ze_loader.dll"]
+        else:
+            loader_names = [ctypes.util.find_library("ze_loader"), "libze_loader.so.1", "libze_loader.so"]
+
+        ze = None
+        for name in loader_names:
+            if not name:
+                continue
+            try:
+                ze = ctypes.CDLL(name)
+                break
+            except OSError:
+                pass
+        if ze is None:
+            return False
+
+        ze.zeInit.argtypes = [ctypes.c_uint32]
+        ze.zeInit.restype = ctypes.c_uint32
+        ze.zeDriverGet.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_void_p)]
+        ze.zeDriverGet.restype = ctypes.c_uint32
+        ze.zeDeviceGet.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_void_p)]
+        ze.zeDeviceGet.restype = ctypes.c_uint32
+        ze.zeDeviceGetProperties.argtypes = [ctypes.c_void_p, ctypes.POINTER(ze_device_properties_t)]
+        ze.zeDeviceGetProperties.restype = ctypes.c_uint32
+
+        if ze.zeInit(0) != ZE_RESULT_SUCCESS:
+            return False
+
+        try:
+            torch_device_id = int(torch.xpu.get_device_properties(torch.xpu.current_device()).device_id)
+        except Exception:
+            torch_device_id = None
+
+        driver_count = ctypes.c_uint32(0)
+        if ze.zeDriverGet(ctypes.byref(driver_count), None) != ZE_RESULT_SUCCESS or driver_count.value == 0:
+            return False
+        allocated_drivers = driver_count.value
+        drivers = (ctypes.c_void_p * allocated_drivers)()
+        if ze.zeDriverGet(ctypes.byref(driver_count), drivers) != ZE_RESULT_SUCCESS:
+            return False
+
+        gpu_devices = []  # (deviceId, is_integrated)
+        for i in range(min(driver_count.value, allocated_drivers)):
+            device_count = ctypes.c_uint32(0)
+            if ze.zeDeviceGet(drivers[i], ctypes.byref(device_count), None) != ZE_RESULT_SUCCESS:
+                return False
+            if device_count.value == 0:
+                continue
+            allocated_devices = device_count.value
+            devices = (ctypes.c_void_p * allocated_devices)()
+            if ze.zeDeviceGet(drivers[i], ctypes.byref(device_count), devices) != ZE_RESULT_SUCCESS:
+                return False
+            for j in range(min(device_count.value, allocated_devices)):
+                props = ze_device_properties_t()
+                props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES
+                props.pNext = None
+                if ze.zeDeviceGetProperties(devices[j], ctypes.byref(props)) != ZE_RESULT_SUCCESS:
+                    return False
+                if props.type != ZE_DEVICE_TYPE_GPU:
+                    continue
+                gpu_devices.append((int(props.deviceId), bool(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)))
+
+        if not gpu_devices:
+            return False
+
+        if torch_device_id is not None:
+            matches = [integrated for device_id, integrated in gpu_devices if device_id == torch_device_id]
+            if matches:
+                # Fail closed if a duplicate PCI device id somehow mixes flags.
+                _INTEL_XPU_DISCRETE = not any(matches)
+                return _INTEL_XPU_DISCRETE
+
+        # No reliable match: only enable when every visible GPU is discrete so a
+        # mixed iGPU+dGPU system never enables streams while running on the iGPU.
+        _INTEL_XPU_DISCRETE = all(not integrated for _, integrated in gpu_devices)
+        return _INTEL_XPU_DISCRETE
+    except Exception as e:
+        logging.info("Could not determine Intel XPU type via Level Zero: {}".format(e))
+        _INTEL_XPU_DISCRETE = False
+        return False
+
+
 STREAMS = {}
 NUM_STREAMS = 0
 if args.async_offload is not None:
    NUM_STREAMS = args.async_offload
 else:
-    #  Enable by default on Nvidia and AMD
-    if is_nvidia() or is_amd():
+    #  Enable by default on Nvidia, AMD, and discrete Intel XPU
+    if not args.disable_async_offload and (is_nvidia() or is_amd() or is_intel_xpu_discrete()):
        NUM_STREAMS = 2

 if args.disable_async_offload:
@ -1487,7 +1622,7 @@ PINNED_MEMORY = {}
 TOTAL_PINNED_MEMORY = 0
 MAX_PINNED_MEMORY = -1
 if not args.disable_pinned_memory:
-    if is_nvidia() or is_amd():
+    if is_nvidia() or is_amd() or is_intel_xpu():
        ram = get_total_memory(torch.device("cpu"))
        if WINDOWS:
            MAX_PINNED_MEMORY = ram * 0.40  # Windows limit is apparently 50%
@ -1512,6 +1647,20 @@ def discard_cuda_async_error():
        #Dump it! We already know about it from the synchronous return
        pass

+def host_register(ptr, size):
+    # Intel XPU has no CUDA host-registration API. The pinnable buffers used by
+    # the DynamicVRAM path are already Level Zero host USM (allocated through the
+    # aimdo hostbuf / zeMemAllocHost), and pageable host memory is still usable
+    # for transfers, so registration is a no-op success on XPU.
+    if is_intel_xpu():
+        return 0
+    return torch.cuda.cudart().cudaHostRegister(ptr, size, 1)
+
+def host_unregister(ptr):
+    if is_intel_xpu():
+        return 0
+    return torch.cuda.cudart().cudaHostUnregister(ptr)
+
 def pin_memory(tensor):
    global TOTAL_PINNED_MEMORY
    if MAX_PINNED_MEMORY <= 0:
@ -1540,7 +1689,7 @@ def pin_memory(tensor):
    if ptr == 0:
        return False

-    if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0:
+    if host_register(ptr, size) == 0:
        PINNED_MEMORY[ptr] = size
        TOTAL_PINNED_MEMORY += size
        return True
@ -1570,7 +1719,7 @@ def unpin_memory(tensor):
        logging.warning("Size of pinned tensor changed")
        return False

-    if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
+    if host_unregister(ptr) == 0:
        size = PINNED_MEMORY.pop(ptr)
        TOTAL_PINNED_MEMORY -= size
        return True
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -1961,7 +1961,7 @@ class ModelPatcherDynamic(ModelPatcher):
                if not module._pin_registered:
                    continue
                size = module._pin.numel() * module._pin.element_size()
-                if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0:
+                if comfy.model_management.host_unregister(module._pin.data_ptr()) != 0:
                    comfy.model_management.discard_cuda_async_error()
                    continue
                module._pin_registered = False
--- a/comfy/pinned_memory.py
+++ b/comfy/pinned_memory.py
@ -53,7 +53,7 @@ def get_pin(module, subset="weights"):
    size = pin.nbytes
    comfy.model_management.ensure_pin_registerable(size)

-    if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
+    if comfy.model_management.host_register(pin.data_ptr(), size) != 0:
        comfy.model_management.discard_cuda_async_error()
        return pin

@ -95,10 +95,10 @@ def pin_memory(module, subset="weights", size=None):
        extended = True
        pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
        pin.untyped_storage()._comfy_hostbuf = hostbuf
-        if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
+        if comfy.model_management.host_register(pin.data_ptr(), size) != 0:
            comfy.model_management.discard_cuda_async_error()
            comfy.model_management.free_registrations(size)
-            if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:
+            if comfy.model_management.host_register(pin.data_ptr(), size) != 0:
                comfy.model_management.discard_cuda_async_error()
                del pin
                hostbuf.truncate(offset, do_unregister=False)
--- a/comfy_extras/nodes_model_merging_model_specific.py
+++ b/comfy_extras/nodes_model_merging_model_specific.py
@ -337,36 +337,6 @@ class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks):

        return {"required": arg_dict}

-class ModelMergeKrea2(comfy_extras.nodes_model_merging.ModelMergeBlocks):
-    CATEGORY = "model/merging/model specific"
-
-    @classmethod
-    def INPUT_TYPES(s):
-        arg_dict = { "model1": ("MODEL",),
-                              "model2": ("MODEL",)}
-
-        argument = ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01})
-
-        arg_dict["first."] = argument
-        arg_dict["tmlp."] = argument
-        arg_dict["txtmlp."] = argument
-        arg_dict["tproj."] = argument
-
-        for i in range(2):
-            arg_dict["txtfusion.layerwise_blocks.{}.".format(i)] = argument
-
-        arg_dict["txtfusion.projector."] = argument
-
-        for i in range(2):
-            arg_dict["txtfusion.refiner_blocks.{}.".format(i)] = argument
-
-        for i in range(28):
-            arg_dict["blocks.{}.".format(i)] = argument
-
-        arg_dict["last."] = argument
-
-        return {"required": arg_dict}
-
 NODE_CLASS_MAPPINGS = {
    "ModelMergeSD1": ModelMergeSD1,
    "ModelMergeSD2": ModelMergeSD1, #SD1 and SD2 have the same blocks
@ -383,5 +353,4 @@ NODE_CLASS_MAPPINGS = {
    "ModelMergeCosmosPredict2_2B": ModelMergeCosmosPredict2_2B,
    "ModelMergeCosmosPredict2_14B": ModelMergeCosmosPredict2_14B,
    "ModelMergeQwenImage": ModelMergeQwenImage,
-    "ModelMergeKrea2": ModelMergeKrea2,
 }
--- a/comfy_extras/nodes_seed.py
+++ b/comfy_extras/nodes_seed.py
@ -1,33 +0,0 @@
-import sys
-from typing_extensions import override
-
-from comfy_api.latest import ComfyExtension, io
-
-
-class SeedNode(io.ComfyNode):
-    @classmethod
-    def define_schema(cls):
-        return io.Schema(
-            node_id="SeedNode",
-            display_name="Seed",
-            search_aliases=["seed", "random"],
-            category="utilities",
-            inputs=[
-                io.Int.Input("seed", min=0, max=sys.maxsize, control_after_generate=io.ControlAfterGenerate.fixed),
-            ],
-            outputs=[io.Int.Output(display_name="seed")],
-        )
-
-    @classmethod
-    def execute(cls, seed: int) -> io.NodeOutput:
-        return io.NodeOutput(seed)
-
-
-class SeedExtension(ComfyExtension):
-    @override
-    async def get_node_list(self) -> list[type[io.ComfyNode]]:
-        return [SeedNode]
-
-
-async def comfy_entrypoint() -> SeedExtension:
-    return SeedExtension()
--- a/main.py
+++ b/main.py
@ -236,7 +236,7 @@ import hook_breaker_ac10a0
 import comfy.memory_management
 import comfy.model_patcher

-if args.enable_dynamic_vram or (enables_dynamic_vram() and comfy.model_management.is_nvidia() and not comfy.model_management.is_wsl()):
+if args.enable_dynamic_vram or (enables_dynamic_vram() and (comfy.model_management.is_nvidia() or comfy.model_management.is_intel_xpu()) and not comfy.model_management.is_wsl()):
    if (not args.enable_dynamic_vram) and (comfy.model_management.torch_version_numeric < (2, 8)):
        logging.warning("Unsupported Pytorch detected. DynamicVRAM support requires Pytorch version 2.8 or later. Falling back to legacy ModelPatcher. VRAM estimates may be unreliable especially on Windows")
    else:
--- a/nodes.py
+++ b/nodes.py
@ -2473,7 +2473,6 @@ async def init_builtin_extra_nodes():
        "nodes_gaussian_splat.py",
        "nodes_triposplat.py",
        "nodes_depth_anything_3.py",
-        "nodes_seed.py",
    ]

    import_failed = []
--- a/openapi.yaml
+++ b/openapi.yaml
@ -1692,12 +1692,6 @@ paths:
                            schema:
                                $ref: '#/components/schemas/ErrorResponse'
                    description: Unsupported media type
-                "422":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: Validation error (e.g., disallowed model_type tag)
                "500":
                    content:
                        application/json:
@ -2143,12 +2137,6 @@ paths:
                            schema:
                                $ref: '#/components/schemas/ErrorResponse'
                    description: Source asset with given hash not found
-                "422":
-                    content:
-                        application/json:
-                            schema:
-                                $ref: '#/components/schemas/ErrorResponse'
-                    description: Validation error (e.g., disallowed model_type tag)
                "500":
                    content:
                        application/json:
@ -3004,7 +2992,7 @@ paths:
                    format: uuid
                    type: string
                - description: |
-                    When present, each output item in the response receives a `short_url` field containing a short link for that asset. Omit this parameter (the default) to receive a response identical to the no-param baseline. The value selects the link's lifetime and auth model: use `ephemeral_tool_chain` for short-lived (≤5 minute) machine-to-machine handoffs — these are public bearer links where the link ID itself is the credential, so anyone holding the link can resolve it (intended for pasting into an agent/MCP tool chain); use `default` for durable (30 day) human-revisitable links, which are owner-gated and resolvable only by the authenticated owner. Links are always minted under the authenticated request owner's identity; the auth model is selected by the server and is never settable by the caller.
+                    When present, each output item in the response receives a `short_url` field containing an owner-gated durable link for that asset. Omit this parameter (the default) to receive a response identical to the no-param baseline. The value selects the link's lifetime: use `ephemeral_tool_chain` for short-lived machine-to-machine handoffs (~15 minutes); use `default` for durable human-revisitable links (30 days). Links are minted only for the authenticated request owner and are not resolvable by other users.
                  in: query
                  name: short_link
                  schema: