
Commit 03e5368

Update Distorch for ComfyUI 0.6.0+ load list with device assignment caching.
1 parent 62f98ed commit 03e5368

2 files changed

Lines changed: 96 additions & 21 deletions
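For orientation: the "load list" change referred to in the commit message is that ComfyUI 0.6.0+ ModelPatcher._load_list() entries gained a leading offload-memory field, turning the legacy 4-tuple into a 5-tuple. A minimal sketch of the two shapes (the module name and sizes below are made up; only the field order is taken from the unpack_load_item helper added in this commit):

import torch.nn as nn

block = nn.Linear(8, 8)
legacy_item = (4096, "input_blocks.0", block, [])     # pre-0.6.0: (module_mem, module_name, module_object, params)
new_item = (0, 4096, "input_blocks.0", block, [])     # 0.6.0+: (module_offload_mem, module_mem, module_name, module_object, params)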


checkpoint_multigpu.py

Lines changed: 53 additions & 8 deletions
@@ -62,6 +62,11 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
     diffusion_model_prefix = comfy.model_detection.unet_prefix_from_state_dict(sd)
     parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
     weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
+
+    custom_operations = model_options.get("custom_operations", None)
+    if custom_operations is None:
+        sd, metadata = comfy.utils.convert_old_quants(sd, diffusion_model_prefix, metadata=metadata)
+
     model_config = comfy.model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata)
 
     if model_config is None:
@@ -79,13 +84,17 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
     if model_config.scaled_fp8 is not None:
         weight_dtype = None
 
-    model_config.custom_operations = model_options.get("custom_operations", None)
+    if custom_operations is not None:
+        model_config.custom_operations = custom_operations
     unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))
     if unet_dtype is None:
         unet_dtype = mm.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype)
 
     unet_compute_device = device_config.get('unet_device', original_main_device)
-    manual_cast_dtype = mm.unet_manual_cast(unet_dtype, torch.device(unet_compute_device), model_config.supported_inference_dtypes)
+    if model_config.scaled_fp8 is not None:
+        manual_cast_dtype = mm.unet_manual_cast(None, torch.device(unet_compute_device), model_config.supported_inference_dtypes)
+    else:
+        manual_cast_dtype = mm.unet_manual_cast(unet_dtype, torch.device(unet_compute_device), model_config.supported_inference_dtypes)
     model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
     logger.info(f"UNet DType: {unet_dtype}, Manual Cast: {manual_cast_dtype}")
 
@@ -101,6 +110,8 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
     multigpu_memory_log(f"unet:{config_hash[:8]}", "pre-load")
 
     model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
+    model.load_model_weights(sd, diffusion_model_prefix)
+    multigpu_memory_log(f"unet:{config_hash[:8]}", "post-weights")
 
     logger.mgpu_mm_log("Invoking soft_empty_cache_multigpu before UNet ModelPatcher setup")
     soft_empty_cache_multigpu()
@@ -116,9 +127,6 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
     logger.info(f"[CHECKPOINT_META] UNET inner_model id=0x{id(inner_model):x}")
     model._distorch_high_precision_loras = distorch_config.get('high_precision_loras', True)
 
-    model.load_model_weights(sd, diffusion_model_prefix)
-    multigpu_memory_log(f"unet:{config_hash[:8]}", "post-weights")
-
     if output_vae:
         vae_target_device = torch.device(device_config.get('vae_device', original_main_device))
         set_current_device(vae_target_device)  # Use main device context for VAE
@@ -130,6 +138,27 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
     multigpu_memory_log(f"vae:{config_hash[:8]}", "post-load")
 
     if output_clip:
+        if te_model_options.get("custom_operations", None) is None:
+            scaled_fp8_list = []
+            for k in list(sd.keys()):  # Convert scaled fp8 to mixed ops
+                if k.endswith(".scaled_fp8"):
+                    scaled_fp8_list.append(k[:-len("scaled_fp8")])
+
+            if len(scaled_fp8_list) > 0:
+                out_sd = {}
+                for k in sd:
+                    skip = False
+                    for pref in scaled_fp8_list:
+                        skip = skip or k.startswith(pref)
+                    if not skip:
+                        out_sd[k] = sd[k]
+
+                for pref in scaled_fp8_list:
+                    quant_sd, qmetadata = comfy.utils.convert_old_quants(sd, pref, metadata={})
+                    for k in quant_sd:
+                        out_sd[k] = quant_sd[k]
+                sd = out_sd
+
         clip_target_device = device_config.get('clip_device', original_clip_device)
         set_current_text_encoder_device(clip_target_device)
 
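Worked example of the scaled-fp8 scan added above, with hypothetical key names: a marker tensor ending in ".scaled_fp8" yields one prefix per quantized text encoder, keys under that prefix are rebuilt through comfy.utils.convert_old_quants, and everything else is copied into out_sd unchanged.

sd_keys = [
    "text_encoders.t5xxl.scaled_fp8",                                      # hypothetical marker tensor
    "text_encoders.t5xxl.encoder.block.0.layer.0.SelfAttention.q.weight",  # hypothetical quantized key
    "text_encoders.clip_l.text_model.embeddings.token_embedding.weight",   # hypothetical unquantized key
]
scaled_fp8_list = [k[:-len("scaled_fp8")] for k in sd_keys if k.endswith(".scaled_fp8")]
print(scaled_fp8_list)  # ['text_encoders.t5xxl.'] -> keys under this prefix go through convert_old_quants; clip_l keys pass through as-is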
@@ -224,15 +253,16 @@ def INPUT_TYPES(s):
                 "ckpt_name": (folder_paths.get_filename_list("checkpoints"), ),
                 "unet_compute_device": (devices, {"default": compute_device}),
                 "unet_virtual_vram_gb": ("FLOAT", {"default": 4.0, "min": 0.0, "max": 128.0, "step": 0.1}),
-                "unet_donor_device": ("STRING", {"default": "cpu"}),
+                "unet_donor_device": (devices, {"default": "cpu"}),
                 "clip_compute_device": (devices, {"default": "cpu"}),
                 "clip_virtual_vram_gb": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 128.0, "step": 0.1}),
-                "clip_donor_device": ("STRING", {"default": "cpu"}),
+                "clip_donor_device": (devices, {"default": "cpu"}),
                 "vae_device": (devices, {"default": compute_device}),
             }, "optional": {
                 "unet_expert_mode_allocations": ("STRING", {"multiline": False, "default": ""}),
                 "clip_expert_mode_allocations": ("STRING", {"multiline": False, "default": ""}),
                 "high_precision_loras": ("BOOLEAN", {"default": True}),
+                "eject_models": ("BOOLEAN", {"default": True}),
             }
         }
 
@@ -243,7 +273,22 @@ def INPUT_TYPES(s):
 
     def load_checkpoint(self, ckpt_name, unet_compute_device, unet_virtual_vram_gb, unet_donor_device,
                         clip_compute_device, clip_virtual_vram_gb, clip_donor_device, vae_device,
-                        unet_expert_mode_allocations="", clip_expert_mode_allocations="", high_precision_loras=True):
+                        unet_expert_mode_allocations="", clip_expert_mode_allocations="", high_precision_loras=True, eject_models=True):
+
+        if eject_models:
+            logger.mgpu_mm_log(f"[EJECT_MODELS_SETUP] eject_models=True - marking all loaded models for eviction")
+            ejection_count = 0
+            for i, lm in enumerate(mm.current_loaded_models):
+                model_name = type(getattr(lm.model, 'model', lm.model)).__name__ if lm.model else 'Unknown'
+                if hasattr(lm.model, 'model') and lm.model.model is not None:
+                    lm.model.model._mgpu_unload_distorch_model = True
+                    logger.mgpu_mm_log(f"[EJECT_MARKED] Model {i}: {model_name} (id=0x{id(lm):x}) → marked for eviction")
+                    ejection_count += 1
+                elif lm.model is not None:
+                    lm.model._mgpu_unload_distorch_model = True
+                    logger.mgpu_mm_log(f"[EJECT_MARKED] Model {i}: {model_name} (direct patcher) → marked for eviction")
+                    ejection_count += 1
+            logger.mgpu_mm_log(f"[EJECT_MODELS_SETUP_COMPLETE] Marked {ejection_count} models for Comfy Core eviction during load_models_gpu")
 
         patch_load_state_dict_guess_config()
 
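Note that eject_models only sets a sentinel attribute here; per the [EJECT_MODELS_SETUP_COMPLETE] log line, the eviction itself is expected to happen later, during load_models_gpu. Purely as an illustration of how such a sentinel could be consumed (this is not code from this commit or this repository):

def wants_distorch_eviction(loaded_model):
    """Hypothetical check for the _mgpu_unload_distorch_model sentinel set above."""
    patcher = loaded_model.model
    inner = getattr(patcher, "model", None) if patcher is not None else None
    return bool(getattr(inner, "_mgpu_unload_distorch_model", False)
                or getattr(patcher, "_mgpu_unload_distorch_model", False))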
distorch_2.py

Lines changed: 43 additions & 13 deletions
@@ -20,6 +20,20 @@
 from .model_management_mgpu import multigpu_memory_log, force_full_system_cleanup
 
 
+
+def unpack_load_item(item):
+    """Handle ComfyUI 0.6.0+ 5-tuple vs legacy 4-tuple"""
+    if len(item) == 5:
+        # (module_offload_mem, module_mem, module_name, module_object, params)
+        return item[1], item[2], item[3], item[4]
+    # (module_mem, module_name, module_object, params)
+    return item[0], item[1], item[2], item[3]
+
+
+
+
+
+
 def register_patched_safetensor_modelpatcher():
     """Register and patch the ModelPatcher for distributed safetensor loading"""
     from comfy.model_patcher import wipe_lowvram_weight, move_weight_functions
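Quick usage check for unpack_load_item (placeholder sizes and names): both entry shapes collapse to the same four fields, with the 5-tuple's leading offload-memory value dropped.

import torch.nn as nn

block = nn.Linear(4, 4)
legacy = (2048, "blocks.0", block, [])
new = (0, 2048, "blocks.0", block, [])
assert unpack_load_item(legacy) == unpack_load_item(new) == (2048, "blocks.0", block, [])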
@@ -236,13 +250,26 @@ def new_partially_load(self, device_to, extra_memory=0, full_load=False, force_p
     mem_counter = 0
 
     is_clip_model = getattr(self, 'is_clip', False)
-    device_assignments = analyze_safetensor_loading(self, allocations, is_clip=is_clip_model)
+    ## TODO - I do not believe this code is needed and needs to be flagged for proof it is needed
+    # Check for valid cache
+    allocations_match = hasattr(self, '_distorch_last_allocations') and self._distorch_last_allocations == allocations
+    cache_exists = hasattr(self, '_distorch_cached_assignments')
+
+    if cache_exists and allocations_match and not unpatch_weights and not force_patch_weights:
+        device_assignments = self._distorch_cached_assignments
+        logger.debug(f"[MultiGPU DisTorch V2] Reusing cached analysis for {type(inner_model).__name__}")
+    else:
+        device_assignments = analyze_safetensor_loading(self, allocations, is_clip=is_clip_model)  ## This should be the only required line - that is how it worked previous release so if it doesn't it is Comfy changes
+        self._distorch_cached_assignments = device_assignments
+        self._distorch_last_allocations = allocations
 
     model_original_dtype = comfy.utils.weight_dtype(self.model.state_dict())
     high_precision_loras = getattr(self.model, "_distorch_high_precision_loras", True)
+    # Use standard ComfyUI load list - the device comparison fix ensures we don't crash
     loading = self._load_list()
     loading.sort(reverse=True)
-    for module_size, module_name, module_object, params in loading:
+    for item in loading:
+        module_size, module_name, module_object, params = unpack_load_item(item)
         if not unpatch_weights and hasattr(module_object, "comfy_patched_weights") and module_object.comfy_patched_weights == True:
             block_target_device = device_assignments['block_assignments'].get(module_name, device_to)
             current_module_device = None
@@ -290,7 +317,7 @@ def new_partially_load(self, device_to, extra_memory=0, full_load=False, force_p
                 logger.debug(f"[MultiGPU DisTorch V2] Cast {module_name}.{param_name} to FP8 for CPU storage")
 
         # Step 4: Move to ultimate destination based on DisTorch assignment
-        if block_target_device != device_to:
+        if str(block_target_device) != str(device_to):
             logger.debug(f"[MultiGPU DisTorch V2] Moving {module_name} from {device_to} to {block_target_device}")
             module_object.to(block_target_device)
             module_object.comfy_cast_weights = True
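The str() normalization above matters because a torch.device never compares equal to a plain device string, so the old comparison could treat "cuda:0" and torch.device("cuda:0") as different devices. A minimal check, assuming standard torch.device semantics:

import torch

dev = torch.device("cuda:0")
print(dev == "cuda:0")       # False - device objects never equal plain strings
print(str(dev) == "cuda:0")  # True  - comparing the string forms matches as intended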
@@ -321,7 +348,10 @@ def _extract_clip_head_blocks(raw_block_list, compute_device):
     head_memory = 0
     block_assignments = {}
 
-    for module_size, module_name, module_object, params in raw_block_list:
+    block_assignments = {}
+
+    for item in raw_block_list:
+        module_size, module_name, module_object, params = unpack_load_item(item)
         if any(kw in module_name.lower() for kw in head_keywords):
             head_blocks.append((module_size, module_name, module_object, params))
             block_assignments[module_name] = compute_device
@@ -423,7 +453,7 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
     total_memory = 0
 
     raw_block_list = model_patcher._load_list()
-    total_memory = sum(module_size for module_size, _, _, _ in raw_block_list)
+    total_memory = sum(unpack_load_item(x)[0] for x in raw_block_list)
 
     MIN_BLOCK_THRESHOLD = total_memory * 0.0001
     logger.debug(f"[MultiGPU DisTorch V2] Total model memory: {total_memory} bytes")
@@ -441,7 +471,8 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
 
     # Build all_blocks list for summary (using full raw_block_list)
     all_blocks = []
-    for module_size, module_name, module_object, params in raw_block_list:
+    for item in raw_block_list:
+        module_size, module_name, module_object, params = unpack_load_item(item)
         block_type = type(module_object).__name__
         # Populate summary dictionaries
         block_summary[block_type] = block_summary.get(block_type, 0) + 1
@@ -450,11 +481,12 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
 
     # Use distributable blocks for actual allocation (for CLIP, this excludes heads)
     distributable_all_blocks = []
-    for module_size, module_name, module_object, params in distributable_raw:
+    for item in distributable_raw:
+        module_size, module_name, module_object, params = unpack_load_item(item)
         distributable_all_blocks.append((module_name, module_object, type(module_object).__name__, module_size))
 
-    block_list = [b for b in distributable_all_blocks if b[3] >= MIN_BLOCK_THRESHOLD]
-    tiny_block_list = [b for b in distributable_all_blocks if b[3] < MIN_BLOCK_THRESHOLD]
+    block_list = [b for b in distributable_all_blocks if (b[3] >= MIN_BLOCK_THRESHOLD and hasattr(b[1], "bias"))]
+    tiny_block_list = [b for b in distributable_all_blocks if b not in block_list]
 
     logger.debug(f"[MultiGPU DisTorch V2] Total blocks: {len(all_blocks)}")
     logger.debug(f"[MultiGPU DisTorch V2] Distributable blocks: {len(block_list)}")
@@ -476,8 +508,6 @@ def analyze_safetensor_loading(model_patcher, allocations_string, is_clip=False)
     # Distribute blocks sequentially from the tail of the model
 
     device_assignments = {device: [] for device in DEVICE_RATIOS_DISTORCH.keys()}
-    block_assignments = {}
-
     # Create a memory quota for each donor device based on its calculated allocation.
     donor_devices = [d for d in sorted_devices]
     donor_quotas = {
@@ -581,7 +611,7 @@ def parse_memory_string(mem_str):
 def calculate_fraction_from_byte_expert_string(model_patcher, byte_str):
     """Convert byte allocation string (e.g. 'cuda:1,4gb;cpu,*') to fractional VRAM allocation string respecting device order and byte quotas."""
     raw_block_list = model_patcher._load_list()
-    total_model_memory = sum(module_size for module_size, _, _, _ in raw_block_list)
+    total_model_memory = sum(unpack_load_item(x)[0] for x in raw_block_list)
     remaining_model_bytes = total_model_memory
 
     # Use a list of tuples to preserve the user-defined order
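The docstring's 'cuda:1,4gb;cpu,*' format is an ordered, semicolon-separated list of device,quota pairs, with '*' meaning "whatever is left". A rough standalone parsing sketch (not the repository's parse_memory_string, whose exact behaviour is not shown in this commit):

def parse_byte_expert_string(byte_str):
    """Illustrative parser: returns ordered (device, byte_quota_or_None) pairs."""
    units = {"kb": 1024, "mb": 1024 ** 2, "gb": 1024 ** 3}
    allocations = []
    for chunk in byte_str.split(";"):
        device, quota = (part.strip() for part in chunk.split(","))
        if quota == "*":
            allocations.append((device, None))  # None = remainder of the model
        else:
            allocations.append((device, int(float(quota[:-2]) * units[quota[-2:].lower()])))
    return allocations

print(parse_byte_expert_string("cuda:1,4gb;cpu,*"))  # [('cuda:1', 4294967296), ('cpu', None)]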
@@ -640,7 +670,7 @@ def calculate_fraction_from_byte_expert_string(model_patcher, byte_str):
 def calculate_fraction_from_ratio_expert_string(model_patcher, ratio_str):
     """Convert ratio allocation string (e.g. 'cuda:0,25%;cpu,75%') describing model split to fractional VRAM allocation string."""
     raw_block_list = model_patcher._load_list()
-    total_model_memory = sum(module_size for module_size, _, _, _ in raw_block_list)
+    total_model_memory = sum(unpack_load_item(x)[0] for x in raw_block_list)
 
     raw_ratios = {}
     for allocation in ratio_str.split(';'):
