feat: patch load_models_gpu for accurate memory calculations; unpatch load_models_gpu
Refactor memory management in `distorch_2.py` to patch `load_models_gpu` instead of `LoadedModel.model_memory_required`. Implement correct memory reporting based on the model flags (`eject_models` and `is_distorch_model`), ensuring proper eviction logic and improved handling of virtual VRAM. Behavior is now driven purely by either comfy core matching or the DisTorch flag, fixing potential issues in multi-GPU setups.
`web/docs/CLIPLoaderDisTorch2MultiGPU.md` (+1, -1)

@@ -14,7 +14,7 @@ This node automatically detects models located in the `ComfyUI/models/clip` fold
 |`virtual_vram_gb`|`FLOAT`| Amount of virtual VRAM in gigabytes to allocate for distributed tensor management (default: 4.0, range: 0.0-128.0). |
 |`donor_device`|`STRING`| Device to donate VRAM from when allocating virtual memory (default: 'cpu'). |
 |`expert_mode_allocations`|`STRING`| Advanced allocation string for expert users to manually specify device/ratio distributions (e.g., 'cuda:0,50%;cpu,*'). |
-|`keep_loaded`|`BOOLEAN`| Whether to keep the model loaded when triggering memory cleanup operations (default: true). |
+|`eject_models`|`BOOLEAN`| Whether to unload ALL models from the target device before loading this model, enabling deterministic model eviction for testing and memory management (default: false for CLIP loaders). |
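The `expert_mode_allocations` string in the table above (e.g., `'cuda:0,50%;cpu,*'`) can be read as semicolon-separated `device,ratio` pairs. The parser below is a hypothetical sketch of that format; the real DisTorch parsing logic may handle more cases.

```python
def parse_allocations(spec: str):
    """Split 'device,ratio;device,ratio' into (device, fraction) pairs.

    A ratio of '*' means 'take the remainder' and is returned as None.
    This mirrors the example format only; it is not the DisTorch parser.
    """
    out = []
    for entry in filter(None, spec.split(";")):
        device, _, ratio = entry.partition(",")
        ratio = ratio.strip()
        if ratio == "*":
            out.append((device.strip(), None))
        else:
            # '50%' -> 0.5
            out.append((device.strip(), float(ratio.rstrip("%")) / 100.0))
    return out
```

For example, `parse_allocations('cuda:0,50%;cpu,*')` yields half of the model on `cuda:0` and the remainder on `cpu`.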
`web/docs/CLIPLoaderGGUFDisTorch2MultiGPU.md` (+1, -1)

@@ -14,7 +14,7 @@ This node automatically detects models located in the `ComfyUI/models/clip` and
 |`virtual_vram_gb`|`FLOAT`| Amount of virtual VRAM in gigabytes to allocate for distributed tensor management (default: 4.0, range: 0.0-128.0). |
 |`donor_device`|`STRING`| Device to donate VRAM from when allocating virtual memory (default: 'cpu'). |
 |`expert_mode_allocations`|`STRING`| Advanced allocation string for expert users to manually specify device/ratio distributions (e.g., 'cuda:0,50%;cpu,*'). |
-|`keep_loaded`|`BOOLEAN`| Whether to keep the model loaded when triggering memory cleanup operations (default: true). |
+|`eject_models`|`BOOLEAN`| Whether to unload ALL models from the target device before loading this model, enabling deterministic model eviction for testing and memory management (default: false for CLIP loaders). |
`web/docs/CLIPVisionLoaderDisTorch2MultiGPU.md` (+1, -1)

@@ -13,7 +13,7 @@ This node automatically detects models located in the `ComfyUI/models/clip_visio
 |`virtual_vram_gb`|`FLOAT`| Amount of virtual VRAM in gigabytes to allocate for distributed tensor management (default: 4.0, range: 0.0-128.0). |
 |`donor_device`|`STRING`| Device to donate VRAM from when allocating virtual memory (default: 'cpu'). |
 |`expert_mode_allocations`|`STRING`| Advanced allocation string for expert users to manually specify device/ratio distributions (e.g., 'cuda:0,50%;cpu,*'). |
-|`keep_loaded`|`BOOLEAN`| Whether to keep the model loaded when triggering memory cleanup operations (default: true). |
+|`eject_models`|`BOOLEAN`| Whether to unload ALL models from the target device before loading this model, enabling deterministic model eviction for testing and memory management (default: true). |
`web/docs/CheckpointLoaderSimpleDisTorch2MultiGPU.md` (+1, -1)

@@ -13,7 +13,7 @@ This node automatically detects models located in the `ComfyUI/models/checkpoint
 |`virtual_vram_gb`|`FLOAT`| Amount of virtual VRAM in gigabytes to allocate for distributed tensor management (default: 4.0, range: 0.0-128.0). |
 |`donor_device`|`STRING`| Device to donate VRAM from when allocating virtual memory (default: 'cpu'). |
 |`expert_mode_allocations`|`STRING`| Advanced allocation string for expert users to manually specify device/ratio distributions (e.g., 'cuda:0,50%;cpu,*'). |
-|`keep_loaded`|`BOOLEAN`| Whether to keep the model loaded when triggering memory cleanup operations (default: true). |
+|`eject_models`|`BOOLEAN`| Whether to unload ALL models from the target device before loading this model, enabling deterministic model eviction for testing and memory management (default: true). |
`web/docs/ControlNetLoaderDisTorch2MultiGPU.md` (+1, -1)

@@ -13,7 +13,7 @@ This node automatically detects models located in the `ComfyUI/models/controlnet
 |`virtual_vram_gb`|`FLOAT`| Amount of virtual VRAM in gigabytes to allocate for distributed tensor management (default: 4.0, range: 0.0-128.0). |
 |`donor_device`|`STRING`| Device to donate VRAM from when allocating virtual memory (default: 'cpu'). |
 |`expert_mode_allocations`|`STRING`| Advanced allocation string for expert users to manually specify device/ratio distributions (e.g., 'cuda:0,50%;cpu,*'). |
-|`keep_loaded`|`BOOLEAN`| Whether to keep the model loaded when triggering memory cleanup operations (default: true). |
+|`eject_models`|`BOOLEAN`| Whether to unload ALL models from the target device before loading this model, enabling deterministic model eviction for testing and memory management (default: true). |
`web/docs/DiffControlNetLoaderDisTorch2MultiGPU.md` (+1, -1)

@@ -13,7 +13,7 @@ This node loads ControlNet models directly from HuggingFace model repositories b
 |`virtual_vram_gb`|`FLOAT`| Amount of virtual VRAM in gigabytes to allocate for distributed tensor management (default: 4.0, range: 0.0-128.0). |
 |`donor_device`|`STRING`| Device to donate VRAM from when allocating virtual memory (default: 'cpu'). |
 |`expert_mode_allocations`|`STRING`| Advanced allocation string for expert users to manually specify device/ratio distributions (e.g., 'cuda:0,50%;cpu,*'). |
-|`keep_loaded`|`BOOLEAN`| Whether to keep the model loaded when triggering memory cleanup operations (default: true). |
+|`eject_models`|`BOOLEAN`| Whether to unload ALL models from the target device before loading this model, enabling deterministic model eviction for testing and memory management (default: true). |