Merge pull request #161 from pollockjj/tmv

pollockjj · web-flow · commit ac3df4ed701d · 2026-01-02T22:55:42.000-06:00
Fix DisTorch Engine for ComfyUI 0.6.0+
diff --git a/checkpoint_multigpu.py b/checkpoint_multigpu.py
@@ -62,6 +62,11 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
         diffusion_model_prefix = comfy.model_detection.unet_prefix_from_state_dict(sd)
         parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
         weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
+
+        custom_operations = model_options.get("custom_operations", None)
+        if custom_operations is None:
+            sd, metadata = comfy.utils.convert_old_quants(sd, diffusion_model_prefix, metadata=metadata)
+
         model_config = comfy.model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata)
         
         if model_config is None:
@@ -79,13 +84,17 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
         if model_config.scaled_fp8 is not None:
             weight_dtype = None
         
-        model_config.custom_operations = model_options.get("custom_operations", None)
+        if custom_operations is not None:
+            model_config.custom_operations = custom_operations
         unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))
         if unet_dtype is None:
             unet_dtype = mm.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype)
         
         unet_compute_device = device_config.get('unet_device', original_main_device)
-        manual_cast_dtype = mm.unet_manual_cast(unet_dtype, torch.device(unet_compute_device), model_config.supported_inference_dtypes)
+        if model_config.scaled_fp8 is not None:
+            manual_cast_dtype = mm.unet_manual_cast(None, torch.device(unet_compute_device), model_config.supported_inference_dtypes)
+        else:
+            manual_cast_dtype = mm.unet_manual_cast(unet_dtype, torch.device(unet_compute_device), model_config.supported_inference_dtypes)
         model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
         logger.info(f"UNet DType: {unet_dtype}, Manual Cast: {manual_cast_dtype}")
 
@@ -101,6 +110,8 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
             multigpu_memory_log(f"unet:{config_hash[:8]}", "pre-load")
 
             model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
+            model.load_model_weights(sd, diffusion_model_prefix)
+            multigpu_memory_log(f"unet:{config_hash[:8]}", "post-weights")
 
             logger.mgpu_mm_log("Invoking soft_empty_cache_multigpu before UNet ModelPatcher setup")
             soft_empty_cache_multigpu()
@@ -116,9 +127,6 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
                     logger.info(f"[CHECKPOINT_META] UNET inner_model id=0x{id(inner_model):x}")
                     model._distorch_high_precision_loras = distorch_config.get('high_precision_loras', True)
 
-            model.load_model_weights(sd, diffusion_model_prefix)
-            multigpu_memory_log(f"unet:{config_hash[:8]}", "post-weights")
-
         if output_vae:
             vae_target_device = torch.device(device_config.get('vae_device', original_main_device))
             set_current_device(vae_target_device) # Use main device context for VAE
@@ -130,6 +138,27 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
             multigpu_memory_log(f"vae:{config_hash[:8]}", "post-load")
 
         if output_clip:
+            if te_model_options.get("custom_operations", None) is None:
+                scaled_fp8_list = []
+                for k in list(sd.keys()):  # Convert scaled fp8 to mixed ops
+                    if k.endswith(".scaled_fp8"):
+                        scaled_fp8_list.append(k[:-len("scaled_fp8")])
+
+                if len(scaled_fp8_list) > 0:
+                    out_sd = {}
+                    for k in sd:
+                        skip = False
+                        for pref in scaled_fp8_list:
+                            skip = skip or k.startswith(pref)
+                        if not skip:
+                           out_sd[k] = sd[k]
+
+                    for pref in scaled_fp8_list:
+                        quant_sd, qmetadata = comfy.utils.convert_old_quants(sd, pref, metadata={})
+                        for k in quant_sd:
+                            out_sd[k] = quant_sd[k]
+                        sd = out_sd
+
             clip_target_device = device_config.get('clip_device', original_clip_device)
             set_current_text_encoder_device(clip_target_device)
             
@@ -224,15 +253,16 @@ def INPUT_TYPES(s):
                 "ckpt_name": (folder_paths.get_filename_list("checkpoints"), ),
                 "unet_compute_device": (devices, {"default": compute_device}),
                 "unet_virtual_vram_gb": ("FLOAT", {"default": 4.0, "min": 0.0, "max": 128.0, "step": 0.1}),
-                "unet_donor_device": ("STRING", {"default": "cpu"}),
+                "unet_donor_device": (devices, {"default": "cpu"}),
                 "clip_compute_device": (devices, {"default": "cpu"}),
                 "clip_virtual_vram_gb": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 128.0, "step": 0.1}),
-                "clip_donor_device": ("STRING", {"default": "cpu"}),
+                "clip_donor_device": (devices, {"default": "cpu"}),
                 "vae_device": (devices, {"default": compute_device}),
             }, "optional": {
                 "unet_expert_mode_allocations": ("STRING", {"multiline": False, "default": ""}),
                 "clip_expert_mode_allocations": ("STRING", {"multiline": False, "default": ""}),
                 "high_precision_loras": ("BOOLEAN", {"default": True}),
+                "eject_models": ("BOOLEAN", {"default": True}),
             }
         }
     
@@ -243,7 +273,22 @@ def INPUT_TYPES(s):
     
     def load_checkpoint(self, ckpt_name, unet_compute_device, unet_virtual_vram_gb, unet_donor_device,
                        clip_compute_device, clip_virtual_vram_gb, clip_donor_device, vae_device,
-                       unet_expert_mode_allocations="", clip_expert_mode_allocations="", high_precision_loras=True):
+                       unet_expert_mode_allocations="", clip_expert_mode_allocations="", high_precision_loras=True, eject_models=True):
+        
+        if eject_models:
+            logger.mgpu_mm_log(f"[EJECT_MODELS_SETUP] eject_models=True - marking all loaded models for eviction")
+            ejection_count = 0
+            for i, lm in enumerate(mm.current_loaded_models):
+                model_name = type(getattr(lm.model, 'model', lm.model)).__name__ if lm.model else 'Unknown'
+                if hasattr(lm.model, 'model') and lm.model.model is not None:
+                    lm.model.model._mgpu_unload_distorch_model = True
+                    logger.mgpu_mm_log(f"[EJECT_MARKED] Model {i}: {model_name} (id=0x{id(lm):x}) → marked for eviction")
+                    ejection_count += 1
+                elif lm.model is not None:
+                    lm.model._mgpu_unload_distorch_model = True
+                    logger.mgpu_mm_log(f"[EJECT_MARKED] Model {i}: {model_name} (direct patcher) → marked for eviction")
+                    ejection_count += 1
+            logger.mgpu_mm_log(f"[EJECT_MODELS_SETUP_COMPLETE] Marked {ejection_count} models for Comfy Core eviction during load_models_gpu")
         
         patch_load_state_dict_guess_config()        
         
diff --git a/distorch_2.py b/distorch_2.py
@@ -20,6 +20,20 @@
 from .model_management_mgpu import multigpu_memory_log, force_full_system_cleanup
 
 
+
+def unpack_load_item(item):
+    """Normalize a _load_list() entry (ComfyUI 0.6.0+ 5-tuple or legacy 4-tuple) to (module_mem, module_name, module_object, params)."""
+    if len(item) == 5:
+        # 0.6.0+ layout: (module_offload_mem, module_mem, module_name, module_object, params); the leading offload size is dropped
+        return item[1], item[2], item[3], item[4]
+    # Any other length is read as the legacy layout: (module_mem, module_name, module_object, params)
+    return item[0], item[1], item[2], item[3]
+
+
+
+
+
+
 def register_patched_safetensor_modelpatcher():
     """Register and patch the ModelPatcher for distributed safetensor loading"""
     from comfy.model_patcher import wipe_lowvram_weight, move_weight_functions
@@ -53,7 +67,7 @@ def patched_load_models_gpu(models, memory_required=0, force_patch_weights=False
                 models_temp.add(m)
                 model_type = type(m).__name__
 
-                if ("GGUF" in model_type or "ModelPatcher" in model_type) and hasattr(m, "model_patches_to"):
+                if ("GGUF" in model_type or "ModelPatcher" in model_type) and hasattr(m, "model_patches_to") and not hasattr(m, "model_patches_models"):
                     logger.info(f"[MultiGPU DisTorch V2] {type(m).__name__} missing 'model_patches_models' attribute, using 'model_patches_to' fallback.")
                     target_device = m.load_device
                     logger.debug(f"[MultiGPU DisTorch V2] Target device: {target_device}")
@@ -236,13 +250,26 @@ def new_partially_load(self, device_to, extra_memory=0, full_load=False, force_p
             mem_counter = 0
 
             is_clip_model = getattr(self, 'is_clip', False)
-            device_assignments = analyze_safetensor_loading(self, allocations, is_clip=is_clip_model)
+            ## TODO - I do not believe the cache check below is needed; it stays flagged until there is proof that it is
+            # Check for valid cache
+            allocations_match = hasattr(self, '_distorch_last_allocations') and self._distorch_last_allocations == allocations
+            cache_exists = hasattr(self, '_distorch_cached_assignments')
+            
+            if cache_exists and allocations_match and not unpatch_weights and not force_patch_weights:
+                device_assignments = self._distorch_cached_assignments
+                logger.debug(f"[MultiGPU DisTorch V2] Reusing cached analysis for {type(inner_model).__name__}")
+            else:
+                device_assignments = analyze_safetensor_loading(self, allocations, is_clip=is_clip_model)  ## This should be the only required line - that is how it worked previous release so if it doesn't it is Comfy changes
+                self._distorch_cached_assignments = device_assignments
+                self._distorch_last_allocations = allocations
             
             model_original_dtype = comfy.utils.weight_dtype(self.model.state_dict())
             high_precision_loras = getattr(self.model, "_distorch_high_precision_loras", True)
+            # Use standard ComfyUI load list - the device comparison fix ensures we don't crash
             loading = self._load_list()
             loading.sort(reverse=True)
-            for module_size, module_name, module_object, params in loading:
+            for item in loading:
+                module_size, module_name, module_object, params = unpack_load_item(item)
                 if not unpatch_weights and hasattr(module_object, "comfy_patched_weights") and module_object.comfy_patched_weights == True:
                     block_target_device = device_assignments['block_assignments'].get(module_name, device_to)
                     current_module_device = None
@@ -290,7 +317,7 @@ def new_partially_load(self, device_to, extra_memory=0, full_load=False, force_p
                             logger.debug(f"[MultiGPU DisTorch V2] Cast {module_name}.{param_name} to FP8 for CPU storage")
 
                 # Step 4: Move to ultimate destination based on DisTorch assignment
-                if block_target_device != device_to:
+                if str(block_target_device) != str(device_to):
                     logger.debug(f"[MultiGPU DisTorch V2] Moving {module_name} from {device_to} to {block_target_device}")
                     module_object.to(block_target_device)
                     module_object.comfy_cast_weights = True
@@ -321,7 +348,10 @@ def _extract_clip_head_blocks(raw_block_list, compute_device):
     head_memory = 0
     block_assignments = {}
     
-    for module_size, module_name, module_object, params in raw_block_list:
+    block_assignments = {}
+    
+    for item in raw_block_list:
+        module_size, module_name, module_object, params = unpack_load_item(item)
         if any(kw in module_name.lower() for kw in head_keywords):
             head_blocks.append((module_size, module_name, module_object, params))
             block_assignments[module_name] = compute_device
@@ -423,7 +453,7 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
     total_memory = 0
 
     raw_block_list = model_patcher._load_list()
-    total_memory = sum(module_size for module_size, _, _, _ in raw_block_list)
+    total_memory = sum(unpack_load_item(x)[0] for x in raw_block_list)
 
     MIN_BLOCK_THRESHOLD = total_memory * 0.0001
     logger.debug(f"[MultiGPU DisTorch V2] Total model memory: {total_memory} bytes")
@@ -441,7 +471,8 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
 
     # Build all_blocks list for summary (using full raw_block_list)
     all_blocks = []
-    for module_size, module_name, module_object, params in raw_block_list:
+    for item in raw_block_list:
+        module_size, module_name, module_object, params = unpack_load_item(item)
         block_type = type(module_object).__name__
         # Populate summary dictionaries
         block_summary[block_type] = block_summary.get(block_type, 0) + 1
@@ -450,11 +481,12 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
 
     # Use distributable blocks for actual allocation (for CLIP, this excludes heads)
     distributable_all_blocks = []
-    for module_size, module_name, module_object, params in distributable_raw:
+    for item in distributable_raw:
+        module_size, module_name, module_object, params = unpack_load_item(item)
         distributable_all_blocks.append((module_name, module_object, type(module_object).__name__, module_size))
 
-    block_list = [b for b in distributable_all_blocks if b[3] >= MIN_BLOCK_THRESHOLD]
-    tiny_block_list = [b for b in distributable_all_blocks if b[3] < MIN_BLOCK_THRESHOLD]
+    block_list = [b for b in distributable_all_blocks if (b[3] >= MIN_BLOCK_THRESHOLD and hasattr(b[1], "bias"))] 
+    tiny_block_list = [b for b in distributable_all_blocks if b not in block_list]
     
     logger.debug(f"[MultiGPU DisTorch V2] Total blocks: {len(all_blocks)}")
     logger.debug(f"[MultiGPU DisTorch V2] Distributable blocks: {len(block_list)}")
@@ -476,8 +508,6 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
     # Distribute blocks sequentially from the tail of the model
 
     device_assignments = {device: [] for device in DEVICE_RATIOS_DISTORCH.keys()}
-    block_assignments = {}
-
     # Create a memory quota for each donor device based on its calculated allocation.
     donor_devices = [d for d in sorted_devices]
     donor_quotas = {
@@ -581,7 +611,7 @@ def parse_memory_string(mem_str):
 def calculate_fraction_from_byte_expert_string(model_patcher, byte_str):
     """Convert byte allocation string (e.g. 'cuda:1,4gb;cpu,*') to fractional VRAM allocation string respecting device order and byte quotas."""
     raw_block_list = model_patcher._load_list()
-    total_model_memory = sum(module_size for module_size, _, _, _ in raw_block_list)
+    total_model_memory = sum(unpack_load_item(x)[0] for x in raw_block_list)
     remaining_model_bytes = total_model_memory
 
     # Use a list of tuples to preserve the user-defined order
@@ -640,7 +670,7 @@ def calculate_fraction_from_byte_expert_string(model_patcher, byte_str):
 def calculate_fraction_from_ratio_expert_string(model_patcher, ratio_str):
     """Convert ratio allocation string (e.g. 'cuda:0,25%;cpu,75%') describing model split to fractional VRAM allocation string."""
     raw_block_list = model_patcher._load_list()
-    total_model_memory = sum(module_size for module_size, _, _, _ in raw_block_list)
+    total_model_memory = sum(unpack_load_item(x)[0] for x in raw_block_list)
 
     raw_ratios = {}
     for allocation in ratio_str.split(';'):
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui-multigpu"
 description = "Provides a suite of custom nodes to manage multiple GPUs for ComfyUI, including advanced model offloading for both GGUF and Safetensor formats with DisTorch, and bespoke MultiGPU support for WanVideoWrapper and other custom nodes."
-version = "2.5.10"
+version = "2.5.11"
 license = {file = "LICENSE"}
 
 [project.urls]
@@ -11,4 +11,4 @@ Repository = "https://github.com/pollockjj/ComfyUI-MultiGPU"
 [tool.comfy]
 PublisherId = "pollockjj"
 DisplayName = "ComfyUI-MultiGPU"
-Icon = "https://raw.githubusercontent.com/pollockjj/ComfyUI-MultiGPU/main/assets/multigpu_icon.png"
+Icon = "https://raw.githubusercontent.com/pollockjj/ComfyUI-MultiGPU/main/assets/multigpu_icon.png"