Skip to content

Commit 8c4034f

Browse files
authored
Merge pull request #180 from pollockjj/fix-dlpack-p2p-cu130
Fix DLPack P2P cross-device transfer (cu130). I believe this fixes the issue: verified on my own mixed-mode 5090/3090 setup, where all four device-pair transfer directions (0→0, 1→1, 0→1, 1→0) passed.
2 parents e64cdf7 + b526e11 commit 8c4034f

3 files changed

Lines changed: 109 additions & 5 deletions

File tree

__init__.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def sample_custom_with_runtime_device(model, *args, **kwargs):
390390
logger.info("[MultiGPU] Patched comfy.sample.sample_custom with runtime device guard")
391391

392392
def _patch_comfy_kitchen_dlpack_device_guard():
393-
"""Guard comfy_kitchen DLPack export by switching to the tensor's CUDA device."""
393+
"""Guard comfy_kitchen DLPack export with P2P-aware CPU-staging fallback."""
394394
try:
395395
comfy_kitchen_cuda = importlib.import_module("comfy_kitchen.backends.cuda")
396396
except ImportError:
@@ -405,14 +405,43 @@ def _patch_comfy_kitchen_dlpack_device_guard():
405405
if getattr(wrap_for_dlpack, "_multigpu_cuda_device_guard", False):
406406
return True
407407

408+
from .p2p_registry import p2p_registry
409+
408410
def wrap_for_dlpack_with_device_guard(*args, **kwargs):
409411
tensor = args[0] if args else kwargs.get("tensor")
410-
with cuda_device_guard(getattr(tensor, "device", None), reason="comfy_kitchen._wrap_for_dlpack"):
411-
return wrap_for_dlpack(*args, **kwargs)
412+
tensor_device = getattr(tensor, "device", None)
413+
exec_device = get_current_device()
414+
exec_device = _coerce_torch_device(exec_device)
415+
416+
# Determine if cross-device staging is needed
417+
needs_staging = False
418+
def _valid_cuda(d):
419+
return d is not None and d.type == "cuda" and d.index is not None
420+
421+
if _valid_cuda(tensor_device) and _valid_cuda(exec_device):
422+
if tensor_device.index != exec_device.index and not p2p_registry.can_access_peer(tensor_device.index, exec_device.index):
423+
needs_staging = True
424+
425+
if needs_staging:
426+
logger.info(
427+
f"[MultiGPU DLPack] CPU-staging tensor from cuda:{tensor_device.index} "
428+
f"to cuda:{exec_device.index} (P2P unavailable)"
429+
)
430+
staged_tensor = tensor.to("cpu").to(exec_device)
431+
wrap_for_dlpack_with_device_guard._dlpack_staging_count += 1
432+
with cuda_device_guard(exec_device, reason="comfy_kitchen._wrap_for_dlpack(staged)"):
433+
if args:
434+
return wrap_for_dlpack(staged_tensor, *args[1:], **kwargs)
435+
else:
436+
return wrap_for_dlpack(staged_tensor, **kwargs)
437+
else:
438+
with cuda_device_guard(tensor_device, reason="comfy_kitchen._wrap_for_dlpack"):
439+
return wrap_for_dlpack(*args, **kwargs)
412440

413441
wrap_for_dlpack_with_device_guard._multigpu_cuda_device_guard = True
442+
wrap_for_dlpack_with_device_guard._dlpack_staging_count = 0
414443
comfy_kitchen_cuda._wrap_for_dlpack = wrap_for_dlpack_with_device_guard
415-
logger.info("[MultiGPU] Applied comfy_kitchen CUDA DLPack device guard patch")
444+
logger.info("[MultiGPU] Applied comfy_kitchen CUDA DLPack device guard patch (P2P-aware)")
416445
return True
417446

418447
logger.info("[MultiGPU Core Patching] Patching mm.get_torch_device, mm.text_encoder_device, mm.unet_offload_device")

p2p_registry.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""P2P accessibility registry for multi-GPU DLPack operations.
2+
3+
Caches cudaDeviceCanAccessPeer results per GPU pair to avoid
4+
repeated CUDA runtime API calls.
5+
"""
6+
7+
import ctypes
8+
import logging
9+
import torch
10+
11+
logger = logging.getLogger("MultiGPU")
12+
13+
_libcudart = None
14+
15+
16+
def _get_libcudart():
    """Load the CUDA runtime library once and cache the handle.

    Tries the unversioned soname first, then common versioned sonames:
    runtime-only CUDA installs typically ship only ``libcudart.so.<major>``,
    while the bare ``libcudart.so`` symlink comes with the dev toolkit.

    Returns:
        ctypes.CDLL: handle to the CUDA runtime library.

    Raises:
        OSError: if no CUDA runtime library can be loaded (last loader
            error is chained as the cause).
    """
    global _libcudart
    if _libcudart is None:
        last_err = None
        for soname in ("libcudart.so", "libcudart.so.13",
                       "libcudart.so.12", "libcudart.so.11.0"):
            try:
                _libcudart = ctypes.CDLL(soname)
                break
            except OSError as err:
                last_err = err
        if _libcudart is None:
            raise OSError("Could not load the CUDA runtime library (libcudart)") from last_err
    return _libcudart
22+
23+
24+
class MultiGPUP2PRegistry:
    """Cached registry for CUDA peer-to-peer accessibility between GPU pairs.

    Uses the CUDA runtime cudaDeviceCanAccessPeer API directly via ctypes
    because torch.cuda.can_access_peer does not exist in PyTorch 2.10.0+.
    Results are cached per (src, dst) pair for the lifetime of the registry.
    """

    def __init__(self):
        # (src_index, dst_index) -> bool. Keys are directional: CUDA does
        # not guarantee that peer accessibility is symmetric.
        self._cache = {}

    @staticmethod
    def _raw_can_access_peer(device_a: int, device_b: int) -> bool:
        """Call cudaDeviceCanAccessPeer via ctypes. Returns True if P2P is available.

        Any failure — libcudart missing/unloadable, or a non-zero CUDA
        error code — is treated as "no P2P" rather than raised, so the
        DLPack wrapper can always fall back to CPU staging.
        """
        try:
            lib = _get_libcudart()
        except OSError as err:
            # Without the runtime library we cannot query P2P; assume the
            # safe answer (no P2P -> CPU staging) instead of crashing.
            logger.warning(
                f"[MultiGPU P2P] Could not load libcudart ({err}), assuming no P2P"
            )
            return False
        can_access = ctypes.c_int(0)
        result = lib.cudaDeviceCanAccessPeer(ctypes.byref(can_access), device_a, device_b)
        if result != 0:
            logger.warning(
                f"[MultiGPU P2P] cudaDeviceCanAccessPeer({device_a}, {device_b}) "
                f"returned error code {result}, assuming no P2P"
            )
            return False
        return bool(can_access.value)

    def can_access_peer(self, src_device: int, dst_device: int) -> bool:
        """Check if src_device can access dst_device memory via P2P.

        Results are cached per (src, dst) pair. Same-device queries are
        always True; when CUDA is unavailable the answer is cached False.
        """
        if src_device == dst_device:
            return True

        key = (src_device, dst_device)
        if key not in self._cache:
            if not torch.cuda.is_available():
                self._cache[key] = False
            else:
                result = self._raw_can_access_peer(src_device, dst_device)
                self._cache[key] = result
                logger.info(
                    f"[MultiGPU P2P] can_access_peer({src_device}, {dst_device}) = {result}"
                )
        return self._cache[key]

    def clear_cache(self):
        """Clear the P2P cache (useful for testing)."""
        self._cache.clear()
72+
73+
74+
# Module-level singleton shared by all importers; its P2P cache therefore
# persists for the lifetime of the process.
p2p_registry = MultiGPUP2PRegistry()

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "comfyui-multigpu"
33
description = "Provides a suite of custom nodes to manage multiple GPUs for ComfyUI, including advanced model offloading for both GGUF and Safetensor formats with DisTorch, and bespoke MultiGPU support for WanVideoWrapper and other custom nodes."
4-
version = "2.6.0"
4+
version = "2.6.1"
55
license = {file = "LICENSE"}
66

77
[project.urls]

0 commit comments

Comments
 (0)