Skip to content

Commit edc8a4d

Browse files
committed
Identified a long-standing bug where fully-allocated CLIP (for example 99G of VirtualVRAM = 100% of major blocks no matter the model) proceeded to execute on the donor device (e.g. cpu) instead of the indicated compute device. Turns out, it only happens when *all* blocks are identified to go onto the donor card. In the case of the donor being the cpu this was irritatingly slow.
On a 4x PCIe bus, swapping a normal CLIP-sized number of layers once/twice (for neg) into compute should be the optimal solution: Reside on `cpu`, use the optimized cuda kernels for computation JiT on `compute`, discard layers once used (residing permanently on `cpu`), then move efficiently to the main UNet computation.
1 parent d34a32f commit edc8a4d

1 file changed

Lines changed: 214 additions & 2 deletions

File tree

distorch_2.py

Lines changed: 214 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,14 @@ def new_partially_load(self, device_to, extra_memory=0, full_load=False, force_p
8686

8787
mem_counter = 0
8888

89-
logger.info(f"[MultiGPU_DisTorch2] Using static allocation for model {debug_hash[:8]}")
90-
device_assignments = analyze_safetensor_loading(self, allocations)
89+
is_clip_model = getattr(self, 'is_clip', False)
90+
if is_clip_model:
91+
logger.info(f"[MultiGPU_DisTorch2] Using CLIP-specific allocation for model {debug_hash[:8]} (HEAD PRESERVATION ENABLED)")
92+
device_assignments = analyze_safetensor_loading_clip(self, allocations)
93+
else:
94+
logger.debug(f"[MultiGPU_DisTorch2] Using standard allocation for model {debug_hash[:8]} (UNET/VAE - UNTOUCHED)")
95+
device_assignments = analyze_safetensor_loading(self, allocations)
96+
9197
model_original_dtype = comfy.utils.weight_dtype(self.model.state_dict())
9298
high_precision_loras = self.model._distorch_high_precision_loras
9399
loading = self._load_list()
@@ -164,6 +170,7 @@ def new_partially_load(self, device_to, extra_memory=0, full_load=False, force_p
164170
def analyze_safetensor_loading(model_patcher, allocations_string):
165171
"""
166172
Analyze and distribute safetensor model blocks across devices
173+
Target for refactor back into one function once stability for CLIP is established.
167174
"""
168175
DEVICE_RATIOS_DISTORCH = {}
169176
device_table = {}
@@ -366,6 +373,211 @@ def analyze_safetensor_loading(model_patcher, allocations_string):
366373
"block_assignments": block_assignments
367374
}
368375

376+
377+
def analyze_safetensor_loading_clip(model_patcher, allocations_string):
    """
    CLIP-SPECIFIC: A 1:1 clone of the working UNET allocation logic with the
    single required modification to preserve head-blocks (embedding layers) on
    the compute device, so a fully-donated CLIP never ends up executing
    entirely on the donor device (e.g. cpu).
    Target for refactor back into analyze_safetensor_loading once stability
    for CLIP is established.

    Args:
        model_patcher: patcher exposing ``_load_list()``, which yields
            ``(module_size, module_name, module_object, params)`` tuples.
            # assumes _load_list() returns a re-iterable sequence (it is
            # iterated more than once below) -- TODO confirm against caller.
        allocations_string: ``"<expert_alloc>#<virtual_vram_spec>"`` where the
            virtual-vram spec leads with the compute device name.

    Returns:
        dict with:
            "device_assignments": {device: [(name, module, type_name, bytes), ...]}
            "block_assignments":  {block_name: device_name}
    """
    DEVICE_RATIOS_DISTORCH = {}
    device_table = {}

    # "<expert string>#<vvram string>"; the vvram string leads with the compute device.
    distorch_alloc, virtual_vram_str = allocations_string.split('#')
    compute_device = virtual_vram_str.split(';')[0]

    logger.info(f"[MultiGPU_DisTorch2_CLIP] CLIP Compute Device: {compute_device}")

    # Normalize whatever form the allocation came in as (empty / byte expert /
    # ratio expert) down to per-device fractions.
    if not distorch_alloc:
        logger.info("[MultiGPU_DisTorch2_CLIP] Expert String Examples:")
        logger.info(" Direct(byte) Mode - cuda:0,500mb;cuda:1,3.0g;cpu,5gb* -> '*' cpu = over/underflow device, put 0.50gb on cuda0, 3.00gb on cuda1, and 5.00gb (or the rest) on cpu")
        logger.info(" Ratio(%) Mode - cuda:0,8%;cuda:1,8%;cpu,4% -> 8:8:4 ratio, put 40% on cuda0, 40% on cuda1, and 20% on cpu")
        distorch_alloc = calculate_safetensor_vvram_allocation(model_patcher, virtual_vram_str)
    elif any(c in distorch_alloc.lower() for c in ['g', 'm', 'k', 'b']):
        distorch_alloc = calculate_fraction_from_byte_expert_string(model_patcher, distorch_alloc)
    elif "%" in distorch_alloc:
        distorch_alloc = calculate_fraction_from_ratio_expert_string(model_patcher, distorch_alloc)

    # Every known device must appear in the string so quota/assignment dicts are complete.
    all_devices = get_device_list()
    present_devices = {item.split(',')[0] for item in distorch_alloc.split(';') if ',' in item}
    for device in all_devices:
        if device not in present_devices:
            distorch_alloc += f";{device},0.0"

    logger.info(f"[MultiGPU_DisTorch2_CLIP] Final CLIP Allocation String: {distorch_alloc}")

    eq_line = "=" * 50
    dash_line = "-" * 50
    fmt_assign = "{:<18}{:>7}{:>14}{:>10}"

    # Convert per-device fractions into absolute GB budgets.
    for allocation in distorch_alloc.split(';'):
        if ',' not in allocation:
            continue
        dev_name, fraction = allocation.split(',')
        fraction = float(fraction)
        total_mem_bytes = mm.get_total_memory(torch.device(dev_name))
        alloc_gb = (total_mem_bytes * fraction) / (1024**3)
        DEVICE_RATIOS_DISTORCH[dev_name] = alloc_gb
        device_table[dev_name] = {
            "fraction": fraction,
            "total_gb": total_mem_bytes / (1024**3),
            "alloc_gb": alloc_gb
        }

    logger.info(eq_line)
    logger.info(" DisTorch2 CLIP Model Device Allocations")
    logger.info(eq_line)

    fmt_rosetta = "{:<8}{:>9}{:>9}{:>11}{:>10}"
    logger.info(fmt_rosetta.format("Device", "VRAM GB", "Dev %", "Model GB", "Dist %"))
    logger.info(dash_line)

    # cpu always sorts last; donors are consumed in this order below.
    sorted_devices = sorted(device_table.keys(), key=lambda d: (d == "cpu", d))

    total_allocated_model_bytes = sum(d["alloc_gb"] * (1024**3) for d in device_table.values())

    for dev in sorted_devices:
        total_dev_gb = device_table[dev]["total_gb"]
        alloc_fraction = device_table[dev]["fraction"]
        alloc_gb = device_table[dev]["alloc_gb"]

        dist_ratio_percent = (alloc_gb * (1024**3) / total_allocated_model_bytes) * 100 if total_allocated_model_bytes > 0 else 0

        logger.info(fmt_rosetta.format(
            dev,
            f"{total_dev_gb:.2f}",
            f"{alloc_fraction*100:.1f}%",
            f"{alloc_gb:.2f}",
            f"{dist_ratio_percent:.1f}%"
        ))

    logger.info(dash_line)

    block_summary = defaultdict(int)
    memory_by_type = defaultdict(int)

    raw_block_list = model_patcher._load_list()
    total_memory = sum(module_size for module_size, _, _, _ in raw_block_list)

    # Split the model into head and distributable parts. Embedding ("head")
    # layers are pinned to the compute device -- this is the one divergence
    # from the UNET path, and the fix for the all-blocks-on-donor bug.
    head_keywords = ['embed', 'wte', 'wpe', 'token_embedding', 'position_embedding']
    head_blocks = []
    distributable_blocks_raw = []
    head_memory = 0

    for module_size, module_name, module_object, _params in raw_block_list:
        if any(keyword in module_name.lower() for keyword in head_keywords):
            head_blocks.append((module_size, module_name, module_object, _params))
        else:
            distributable_blocks_raw.append((module_size, module_name, module_object, _params))

    # Blocks below 0.01% of the model are too small to be worth distributing.
    MIN_BLOCK_THRESHOLD = total_memory * 0.0001
    all_blocks = []

    for module_size, module_name, module_object, _params in raw_block_list:
        block_type = type(module_object).__name__
        block_summary[block_type] += 1
        memory_by_type[block_type] += module_size
        all_blocks.append((module_name, module_object, block_type, module_size))

    # Use only the distributable part for the actual allocation logic.
    distributable_all_blocks = [
        (module_name, module_object, type(module_object).__name__, module_size)
        for module_size, module_name, module_object, _params in distributable_blocks_raw
    ]

    block_list = [b for b in distributable_all_blocks if b[3] >= MIN_BLOCK_THRESHOLD]
    tiny_block_list = [b for b in distributable_all_blocks if b[3] < MIN_BLOCK_THRESHOLD]

    logger.info(" DisTorch2 CLIP Model Layer Distribution")
    logger.info(dash_line)
    fmt_layer = "{:<18}{:>7}{:>14}{:>10}"
    logger.info(fmt_layer.format("Layer Type", "Layers", "Memory (MB)", "% Total"))
    logger.info(dash_line)

    for layer_type, count in block_summary.items():
        mem_mb = memory_by_type[layer_type] / (1024 * 1024)
        mem_percent = (memory_by_type[layer_type] / total_memory) * 100 if total_memory > 0 else 0
        logger.info(fmt_layer.format(layer_type[:18], str(count), f"{mem_mb:.2f}", f"{mem_percent:.1f}%"))

    logger.info(dash_line)

    block_assignments = {}

    # Pre-assign head blocks to the compute device and total their memory.
    for module_size, module_name, module_object, _params in head_blocks:
        block_assignments[module_name] = compute_device
        head_memory += module_size
    if head_blocks:
        logger.info(f"[MultiGPU_DisTorch2_CLIP] Preserving {len(head_blocks)} head layer(s) ({head_memory / (1024*1024):.2f} MB) on compute device: {compute_device}")

    donor_devices = list(sorted_devices)
    donor_quotas = {
        dev: device_table[dev]["alloc_gb"] * (1024**3)
        for dev in donor_devices
    }
    # Adjust the compute_device quota to account for the locked head.
    if compute_device in donor_quotas:
        donor_quotas[compute_device] = max(0, donor_quotas[compute_device] - head_memory)

    # Greedy fill: walk blocks from the tail of the model, placing each on the
    # first donor with remaining quota; overflow lands on the compute device.
    for block_name, module, block_type, block_memory in reversed(block_list):
        assigned_to_donor = False
        for donor in donor_devices:
            if donor_quotas[donor] >= block_memory:
                block_assignments[block_name] = donor
                donor_quotas[donor] -= block_memory
                assigned_to_donor = True
                break  # Move to the next block

        if not assigned_to_donor:
            block_assignments[block_name] = compute_device

    # Tiny blocks always live on the compute device.
    for block_name, module, block_type, block_memory in tiny_block_list:
        block_assignments[block_name] = compute_device

    # O(n) first-occurrence index by block name (replaces a nested O(n^2) scan).
    block_info_by_name = {}
    for b_name, b_module, b_type, b_mem in all_blocks:
        block_info_by_name.setdefault(b_name, (b_name, b_module, b_type, b_mem))

    device_assignments = {device: [] for device in DEVICE_RATIOS_DISTORCH}
    for block_name, device in block_assignments.items():
        info = block_info_by_name.get(block_name)
        if info is not None:
            device_assignments[device].append(info)

    logger.info("DisTorch2 CLIP Model Final Device/Layer Assignments")
    logger.info(dash_line)
    logger.info(fmt_assign.format("Device", "Layers", "Memory (MB)", "% Total"))
    logger.info(dash_line)

    device_memories = defaultdict(int)
    device_counts = defaultdict(int)
    for device, blocks in device_assignments.items():
        for b_name, b_module, b_type, b_mem in blocks:
            device_memories[device] += b_mem
            device_counts[device] += 1

    sorted_assignments = sorted(device_memories.keys(), key=lambda d: (d == "cpu", d))

    for dev in sorted_assignments:
        if device_counts[dev] == 0:
            continue
        mem_mb = device_memories[dev] / (1024 * 1024)
        mem_percent = (device_memories[dev] / total_memory) * 100 if total_memory > 0 else 0
        logger.info(fmt_assign.format(dev, str(device_counts[dev]), f"{mem_mb:.2f}", f"{mem_percent:.1f}%"))

    logger.info(dash_line)

    return {
        "device_assignments": device_assignments,
        "block_assignments": block_assignments
    }
369581
def parse_memory_string(mem_str):
370582
"""Parses a memory string (e.g., '4.0g', '512M') and returns bytes."""
371583
mem_str = mem_str.strip().lower()

0 commit comments

Comments
 (0)