Skip to content

Commit c63b539

Browse files
committed
Additional garbage/cache collection (#101); addresses DisTorch2 device issue for CLIP, hopefully closing #99 and #104
Add comprehensive memory cache clearing aligned with ComfyUI patterns to improve stability and reduce OOM incidents in multi-device scenarios. **Addresses Memory/Garbage Collection Issues:** - Created `soft_empty_cache_multigpu()` function in device_utils.py - Replicates ComfyUI's cache clearing for all devices (CUDA, MPS, XPU, NPU, MLU) - Includes CUDA IPC collect optimization like ComfyUI - Strategically placed calls before major memory allocations **Addresses CLIP loading issues:** - Fixed DisTorch2 device variable management before text encoder operations **`soft_empty_cache_multigpu()` implementation aligned with ComfyUI's patterns:** - Called after GC operations - Placed before major memory allocations - Matches ComfyUI's proven memory management strategy - Same device clearing logic for multi-device scenarios
1 parent 0adf219 commit c63b539

5 files changed

Lines changed: 185 additions & 39 deletions

File tree

__init__.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,15 @@
2929
# Global device state management
3030
current_device = mm.get_torch_device()
3131
current_text_encoder_device = mm.text_encoder_device()
32+
current_text_encoder_initial_device = mm.text_encoder_device()
3233

3334
def set_current_device(device):
3435
global current_device
3536
current_device = device
3637
logger.info(f"[MultiGPU Initialization] current_device set to: {device}")
3738

3839
def set_current_text_encoder_device(device):
39-
global current_text_encoder_device
40+
global current_text_encoder_device, current_text_encoder_initial_device
4041
current_text_encoder_device = device
4142
current_text_encoder_initial_device = device
4243
logger.info(f"[MultiGPU Initialization] current_text_encoder_device and current_text_encoder_initial_device set to: {device}")
@@ -192,7 +193,8 @@ def check_module_exists(module_path):
192193
register_patched_safetensor_modelpatcher,
193194
analyze_safetensor_loading,
194195
calculate_safetensor_vvram_allocation,
195-
override_class_with_distorch_safetensor_v2
196+
override_class_with_distorch_safetensor_v2,
197+
override_class_with_distorch_safetensor_v2_clip
196198
)
197199

198200
# Import advanced checkpoint loaders
@@ -229,13 +231,13 @@ def check_module_exists(module_path):
229231
# DisTorch 2 SafeTensor nodes for FLUX and other safetensor models
230232
NODE_CLASS_MAPPINGS["UNETLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["UNETLoader"])
231233
NODE_CLASS_MAPPINGS["VAELoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["VAELoader"])
232-
NODE_CLASS_MAPPINGS["CLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["CLIPLoader"])
233-
NODE_CLASS_MAPPINGS["DualCLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["DualCLIPLoader"])
234+
NODE_CLASS_MAPPINGS["CLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2_clip(GLOBAL_NODE_CLASS_MAPPINGS["CLIPLoader"])
235+
NODE_CLASS_MAPPINGS["DualCLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2_clip(GLOBAL_NODE_CLASS_MAPPINGS["DualCLIPLoader"])
234236
if "TripleCLIPLoader" in GLOBAL_NODE_CLASS_MAPPINGS:
235-
NODE_CLASS_MAPPINGS["TripleCLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["TripleCLIPLoader"])
237+
NODE_CLASS_MAPPINGS["TripleCLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2_clip(GLOBAL_NODE_CLASS_MAPPINGS["TripleCLIPLoader"])
236238
if "QuadrupleCLIPLoader" in GLOBAL_NODE_CLASS_MAPPINGS:
237-
NODE_CLASS_MAPPINGS["QuadrupleCLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["QuadrupleCLIPLoader"])
238-
NODE_CLASS_MAPPINGS["CLIPVisionLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["CLIPVisionLoader"])
239+
NODE_CLASS_MAPPINGS["QuadrupleCLIPLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2_clip(GLOBAL_NODE_CLASS_MAPPINGS["QuadrupleCLIPLoader"])
240+
NODE_CLASS_MAPPINGS["CLIPVisionLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2_clip(GLOBAL_NODE_CLASS_MAPPINGS["CLIPVisionLoader"])
239241
NODE_CLASS_MAPPINGS["CheckpointLoaderSimpleDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["CheckpointLoaderSimple"])
240242
NODE_CLASS_MAPPINGS["ControlNetLoaderDisTorch2MultiGPU"] = override_class_with_distorch_safetensor_v2(GLOBAL_NODE_CLASS_MAPPINGS["ControlNetLoader"])
241243
if "DiffusersLoader" in GLOBAL_NODE_CLASS_MAPPINGS:
@@ -307,10 +309,10 @@ def register_and_count(module_names, node_map):
307309
"QuadrupleCLIPLoaderGGUFDisTorchMultiGPU": override_class_with_distorch_clip(QuadrupleCLIPLoaderGGUF),
308310
"UnetLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2(UnetLoaderGGUF),
309311
"UnetLoaderGGUFAdvancedDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2(UnetLoaderGGUFAdvanced),
310-
"CLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2(CLIPLoaderGGUF),
311-
"DualCLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2(DualCLIPLoaderGGUF),
312-
"TripleCLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2(TripleCLIPLoaderGGUF),
313-
"QuadrupleCLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2(QuadrupleCLIPLoaderGGUF),
312+
"CLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2_clip(CLIPLoaderGGUF),
313+
"DualCLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2_clip(DualCLIPLoaderGGUF),
314+
"TripleCLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2_clip(TripleCLIPLoaderGGUF),
315+
"QuadrupleCLIPLoaderGGUFDisTorch2MultiGPU": override_class_with_distorch_safetensor_v2_clip(QuadrupleCLIPLoaderGGUF),
314316
"UnetLoaderGGUFMultiGPU": override_class(UnetLoaderGGUF),
315317
"UnetLoaderGGUFAdvancedMultiGPU": override_class(UnetLoaderGGUFAdvanced),
316318
"CLIPLoaderGGUFMultiGPU": override_class_clip(CLIPLoaderGGUF),

checkpoint_multigpu.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import comfy.model_detection
1313
import comfy.clip_vision
1414
from comfy.sd import VAE, CLIP
15-
from .device_utils import get_device_list
15+
from .device_utils import get_device_list, soft_empty_cache_multigpu
1616
from .distorch_2 import safetensor_allocation_store, safetensor_settings_store, create_safetensor_model_hash, register_patched_safetensor_modelpatcher
1717

1818
logger = logging.getLogger("MultiGPU")
@@ -107,8 +107,9 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
107107

108108
model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
109109

110+
soft_empty_cache_multigpu(logger)
110111
model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=unet_compute_device, offload_device=mm.unet_offload_device())
111-
112+
112113
if distorch_config and 'unet_allocation' in distorch_config:
113114
register_patched_safetensor_modelpatcher()
114115
model_hash = create_safetensor_model_hash(model_patcher, "checkpoint_loader_unet")
@@ -136,6 +137,7 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
136137
if clip_target is not None:
137138
clip_sd = model_config.process_clip_state_dict(sd)
138139
if len(clip_sd) > 0:
140+
soft_empty_cache_multigpu(logger)
139141
clip_params = comfy.utils.calculate_parameters(clip_sd)
140142
clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=clip_params, model_options=te_model_options)
141143

device_utils.py

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ def get_device_list():
4545
if hasattr(torch, "cuda") and hasattr(torch.cuda, "is_available") and torch.cuda.is_available():
4646
device_count = torch.cuda.device_count()
4747
devs += [f"cuda:{i}" for i in range(device_count)]
48-
logger.debug(f"[MultiGPU] Found {device_count} CUDA device(s)")
48+
logger.debug(f"[MultiGPU_Device_Utils] Found {device_count} CUDA device(s)")
4949
except Exception as e:
50-
logger.debug(f"[MultiGPU] CUDA detection failed: {e}")
50+
logger.debug(f"[MultiGPU_Device_Utils] CUDA detection failed: {e}")
5151

5252
# XPU devices (Intel GPUs)
5353
try:
@@ -59,47 +59,47 @@ def get_device_list():
5959
if hasattr(torch, "xpu") and hasattr(torch.xpu, "is_available") and torch.xpu.is_available():
6060
device_count = torch.xpu.device_count()
6161
devs += [f"xpu:{i}" for i in range(device_count)]
62-
logger.debug(f"[MultiGPU] Found {device_count} XPU device(s)")
62+
logger.debug(f"[MultiGPU_Device_Utils] Found {device_count} XPU device(s)")
6363
except Exception as e:
64-
logger.debug(f"[MultiGPU] XPU detection failed: {e}")
64+
logger.debug(f"[MultiGPU_Device_Utils] XPU detection failed: {e}")
6565

6666
# NPU devices (Ascend NPUs from Huawei)
6767
try:
6868
import torch_npu
6969
if hasattr(torch, "npu") and hasattr(torch.npu, "is_available") and torch.npu.is_available():
7070
device_count = torch.npu.device_count()
7171
devs += [f"npu:{i}" for i in range(device_count)]
72-
logger.debug(f"[MultiGPU] Found {device_count} NPU device(s)")
72+
logger.debug(f"[MultiGPU_Device_Utils] Found {device_count} NPU device(s)")
7373
except Exception as e:
74-
logger.debug(f"[MultiGPU] NPU detection failed: {e}")
74+
logger.debug(f"[MultiGPU_Device_Utils] NPU detection failed: {e}")
7575

7676
# MLU devices (Cambricon MLUs)
7777
try:
7878
import torch_mlu
7979
if hasattr(torch, "mlu") and hasattr(torch.mlu, "is_available") and torch.mlu.is_available():
8080
device_count = torch.mlu.device_count()
8181
devs += [f"mlu:{i}" for i in range(device_count)]
82-
logger.debug(f"[MultiGPU] Found {device_count} MLU device(s)")
82+
logger.debug(f"[MultiGPU_Device_Utils] Found {device_count} MLU device(s)")
8383
except Exception as e:
84-
logger.debug(f"[MultiGPU] MLU detection failed: {e}")
84+
logger.debug(f"[MultiGPU_Device_Utils] MLU detection failed: {e}")
8585

8686
# MPS device (Apple Metal - single device only)
8787
try:
8888
if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
8989
devs.append("mps")
90-
logger.debug("[MultiGPU] Found MPS device")
90+
logger.debug("[MultiGPU_Device_Utils] Found MPS device")
9191
except Exception as e:
92-
logger.debug(f"[MultiGPU] MPS detection failed: {e}")
92+
logger.debug(f"[MultiGPU_Device_Utils] MPS detection failed: {e}")
9393

9494
# DirectML devices (Windows DirectML for AMD/Intel/NVIDIA)
9595
try:
9696
import torch_directml
9797
adapter_count = torch_directml.device_count()
9898
if adapter_count > 0:
9999
devs += [f"directml:{i}" for i in range(adapter_count)]
100-
logger.debug(f"[MultiGPU] Found {adapter_count} DirectML adapter(s)")
100+
logger.debug(f"[MultiGPU_Device_Utils] Found {adapter_count} DirectML adapter(s)")
101101
except Exception as e:
102-
logger.debug(f"[MultiGPU] DirectML detection failed: {e}")
102+
logger.debug(f"[MultiGPU_Device_Utils] DirectML detection failed: {e}")
103103

104104
# IXUCA/CoreX devices (special accelerator)
105105
try:
@@ -108,18 +108,18 @@ def get_device_list():
108108
if hasattr(torch.corex, "device_count"):
109109
device_count = torch.corex.device_count()
110110
devs += [f"corex:{i}" for i in range(device_count)]
111-
logger.debug(f"[MultiGPU] Found {device_count} CoreX device(s)")
111+
logger.debug(f"[MultiGPU_Device_Utils] Found {device_count} CoreX device(s)")
112112
else:
113113
devs.append("corex:0")
114-
logger.debug("[MultiGPU] Found CoreX device")
114+
logger.debug("[MultiGPU_Device_Utils] Found CoreX device")
115115
except Exception as e:
116-
logger.debug(f"[MultiGPU] CoreX detection failed: {e}")
116+
logger.debug(f"[MultiGPU_Device_Utils] CoreX detection failed: {e}")
117117

118118
# Cache the result for future calls
119119
_DEVICE_LIST_CACHE = devs
120120

121121
# Log only once when initially populated
122-
logger.info(f"[MultiGPU] Device list initialized: {devs}")
122+
logger.info(f"[MultiGPU_Device_Utils] Device list initialized: {devs}")
123123

124124
return devs
125125

@@ -218,14 +218,54 @@ def get_device_type(device_string):
218218
def parse_device_string(device_string):
219219
"""
220220
Parse a device string into type and index.
221-
221+
222222
Args:
223223
device_string: Device identifier like "cuda:0", "cpu", "xpu:1", etc.
224-
224+
225225
Returns:
226226
Tuple of (device_type, device_index) where index is None for non-indexed devices
227227
"""
228228
if ":" in device_string:
229229
parts = device_string.split(":")
230230
return parts[0], int(parts[1])
231231
return device_string, None
232+
233+
234+
def soft_empty_cache_multigpu(logger):
235+
"""
236+
Replicate ComfyUI's cache clearing but for ALL devices in MultiGPU.
237+
MultiGPU adaptation of ComfyUI's soft_empty_cache() functionality.
238+
"""
239+
import gc
240+
241+
logger.info("[MultiGPU_Device_Utils] Preparing devices for optimized safetensor loading")
242+
243+
# Python GC (same as all implementations)
244+
gc.collect()
245+
logger.debug("[MultiGPU_Device_Utils] Performed garbage collection before safetensor loading")
246+
247+
# Clear cache for ALL devices (not just ComfyUI's single device)
248+
all_devices = get_device_list()
249+
250+
for device_str in all_devices:
251+
if device_str.startswith("cuda:"):
252+
device_idx = int(device_str.split(":")[1])
253+
torch.cuda.set_device(device_idx)
254+
torch.cuda.empty_cache()
255+
torch.cuda.ipc_collect() # ComfyUI's CUDA optimization
256+
logger.debug(f"[MultiGPU_Device_Utils] Cleared cache + IPC for {device_str}")
257+
elif device_str == "mps":
258+
torch.mps.empty_cache()
259+
logger.debug("[MultiGPU_Device_Utils] Cleared cache for MPS")
260+
elif device_str.startswith("xpu:"):
261+
torch.xpu.empty_cache()
262+
logger.debug("[MultiGPU_Device_Utils] Cleared cache for Intel XPU")
263+
elif device_str.startswith("npu:"):
264+
torch.npu.empty_cache()
265+
logger.debug("[MultiGPU_Device_Utils] Cleared cache for Ascend NPU")
266+
elif device_str.startswith("mlu:"):
267+
torch.mlu.empty_cache()
268+
logger.debug("[MultiGPU_Device_Utils] Cleared cache for Cambricon MLU")
269+
elif device_str.startswith("corex:"):
270+
torch.corex.empty_cache() # Hypothetical based on ComfyUI's ixuca support
271+
logger.debug("[MultiGPU_Device_Utils] Cleared cache for CoreX")

distorch.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import copy
1313
from collections import defaultdict
1414
import comfy.model_management as mm
15-
from .device_utils import get_device_list
15+
from .device_utils import get_device_list, soft_empty_cache_multigpu
1616

1717
# Global store for model allocations
1818
model_allocation_store = {}
@@ -62,6 +62,7 @@ def new_load(self, *args, force_patch_weights=False, **kwargs):
6262
debug_hash = create_model_hash(self, "patcher")
6363
debug_allocations = model_allocation_store.get(debug_hash)
6464
if debug_allocations:
65+
soft_empty_cache_multigpu(logger)
6566
device_assignments = analyze_ggml_loading(self.model, debug_allocations)['device_assignments']
6667
for device, layers in device_assignments.items():
6768
target_device = torch.device(device)

0 commit comments

Comments
 (0)