Skip to content

Commit 0b1511e

Browse files
committed
refactor: Simplify checkpoint loading and fix text encoder device
This commit introduces two main improvements: refactoring the checkpoint loading mechanism and fixing the initial device placement for the text encoder (CLIP). 1. **Fix Text Encoder Device Handling:** - A new patch is applied to `mm.text_encoder_initial_device` to gain control over the device used when the text encoder is first loaded. - The `CLIPLoader` override now forces `device='default'` to ensure ComfyUI's patching mechanism is triggered correctly, preventing the text encoder from being placed on the wrong GPU. 2. **Refactor Checkpoint Loaders:** - Removed the global stores (`checkpoint_dtype_store`, `checkpoint_half_store`, `checkpoint_config_store`). - The `CheckpointLoaderSimpleMultiGPU` and `AdvCheckpointLoaderMultiGPU` nodes now use arguments and ComfyUI's internal defaults directly. This simplifies the logic, reduces global state, and makes the code easier to follow. Additionally, log message prefixes have been updated to be more descriptive, aiding in debugging.
1 parent 9e14e46 commit 0b1511e

2 files changed

Lines changed: 27 additions & 45 deletions

File tree

__init__.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
handler.setFormatter(formatter)
2424
logger.addHandler(handler)
2525
logger.setLevel(log_level)
26-
logger.info(f"[MultiGPU] Logger initialized with level: {logging.getLevelName(log_level)}")
26+
logger.info(f"[MultiGPU Initialization] Logger initialized with level: {logging.getLevelName(log_level)}")
2727

2828

2929
# Global device state management
@@ -33,12 +33,13 @@
3333
def set_current_device(device):
3434
global current_device
3535
current_device = device
36-
logger.info(f"[MultiGPU] current_device set to: {device}")
36+
logger.info(f"[MultiGPU Initialization] current_device set to: {device}")
3737

3838
def set_current_text_encoder_device(device):
3939
global current_text_encoder_device
4040
current_text_encoder_device = device
41-
logger.info(f"[MultiGPU] current_text_encoder_device set to: {device}")
41+
current_text_encoder_initial_device = device
42+
logger.info(f"[MultiGPU Initialization] current_text_encoder_device and current_text_encoder_initial_device set to: {device}")
4243

4344
def override_class(cls):
4445
class NodeOverride(cls):
@@ -55,15 +56,12 @@ def INPUT_TYPES(s):
5556
FUNCTION = "override"
5657

5758
def override(self, *args, device=None, **kwargs):
58-
logger.debug(f"[MultiGPU] override_class called for {cls.__name__} with device={device}")
59-
59+
6060
if device is not None:
6161
set_current_device(device)
62-
6362
fn = getattr(super(), cls.FUNCTION)
6463
out = fn(*args, **kwargs)
65-
logger.debug(f"[MultiGPU] override_class for {cls.__name__} completed successfully")
66-
64+
6765
return out
6866

6967
return NodeOverride
@@ -85,7 +83,7 @@ def INPUT_TYPES(s):
8583
def override(self, *args, device=None, **kwargs):
8684
if device is not None:
8785
set_current_text_encoder_device(device)
88-
86+
kwargs['device'] = 'default'
8987
fn = getattr(super(), cls.FUNCTION)
9088
out = fn(*args, **kwargs)
9189

@@ -100,7 +98,7 @@ def get_torch_device_patched():
10098
else:
10199
devs = set(get_device_list())
102100
device = torch.device(current_device) if str(current_device) in devs else torch.device("cpu")
103-
logger.debug(f"[MultiGPU] get_torch_device_patched returning device: {device} (current_device={current_device})")
101+
logger.debug(f"[MultiGPU Core Patching] get_torch_device_patched returning device: {device} (current_device={current_device})")
104102
return device
105103

106104
def text_encoder_device_patched():
@@ -110,16 +108,22 @@ def text_encoder_device_patched():
110108
else:
111109
devs = set(get_device_list())
112110
device = torch.device(current_text_encoder_device) if str(current_text_encoder_device) in devs else torch.device("cpu")
113-
logger.debug(f"[MultiGPU] text_encoder_device_patched returning device: {device} (current_text_encoder_device={current_text_encoder_device})")
111+
logger.debug(f"[MultiGPU Core Patching] text_encoder_device_patched returning device: {device} (current_text_encoder_device={current_text_encoder_device})")
114112
return device
115113

116-
# Apply patches
117-
logger.info(f"[MultiGPU] Patching mm.get_torch_device and mm.text_encoder_device")
118-
logger.debug(f"[MultiGPU] Initial current_device: {current_device}")
119-
logger.debug(f"[MultiGPU] Initial current_text_encoder_device: {current_text_encoder_device}")
114+
def text_encoder_initial_device_patched(*args, **kwargs):
115+
logger.debug(f"[MultiGPU Core Patching] text_encoder_initial_device_patched called with args={args}, kwargs={kwargs}")
116+
# look at this later - I am not convinced that this isn't the better choice:
117+
# return text_encoder_device_patched()
118+
return mm.text_encoder_device()
119+
120+
121+
logger.info(f"[MultiGPU Core Patching] Patching mm.get_torch_device, mm.text_encoder_device, and mm.text_encoder_initial_device")
122+
logger.debug(f"[MultiGPU DEBUG] Initial current_device: {current_device}")
123+
logger.debug(f"[MultiGPU DEBUG] Initial current_text_encoder_device: {current_text_encoder_device}")
120124
mm.get_torch_device = get_torch_device_patched
121125
mm.text_encoder_device = text_encoder_device_patched
122-
logger.debug(f"[MultiGPU] Patches applied successfully")
126+
mm.text_encoder_initial_device = text_encoder_initial_device_patched
123127

124128
def check_module_exists(module_path):
125129
full_path = os.path.join(folder_paths.get_folder_paths("custom_nodes")[0], module_path)

checkpoint_multigpu.py

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,9 @@
1717

1818
logger = logging.getLogger("MultiGPU")
1919

20-
# --- Global Stores for Configuration ---
2120
checkpoint_device_config = {}
2221
checkpoint_distorch_config = {}
2322

24-
# --- Original Function Store ---
2523
original_load_state_dict_guess_config = None
2624

2725
def patch_load_state_dict_guess_config():
@@ -45,34 +43,29 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
4543

4644
from . import set_current_device, set_current_text_encoder_device, current_device, current_text_encoder_device
4745

48-
# --- Check for custom configuration ---
4946
sd_size = sum(p.numel() for p in sd.values() if hasattr(p, 'numel'))
5047
config_hash = str(sd_size)
5148
device_config = checkpoint_device_config.get(config_hash)
5249
distorch_config = checkpoint_distorch_config.get(config_hash)
5350

5451
if not device_config and not distorch_config:
55-
# No config, fall back to original untouched function
5652
return original_load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options, metadata)
5753

5854
logger.info("--- [MultiGPU] ENTERING Patched Checkpoint Loader ---")
5955
logger.info(f"Received Device Config: {device_config}")
6056
logger.info(f"Received DisTorch2 Config: {distorch_config}")
6157

62-
# --- Start of Rewritten Logic ---
6358
clip = None
6459
clipvision = None
6560
vae = None
6661
model = None
6762
model_patcher = None
6863

69-
# Store original device contexts to restore later
7064
original_main_device = current_device
7165
original_clip_device = current_text_encoder_device
7266
logger.info(f"Saved original device contexts: UNet/VAE='{original_main_device}', CLIP='{original_clip_device}'")
7367

7468
try:
75-
# --- Model Configuration Detection (Replicated from original) ---
7669
diffusion_model_prefix = comfy.model_detection.unet_prefix_from_state_dict(sd)
7770
parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
7871
weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
@@ -94,31 +87,26 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
9487
weight_dtype = None
9588

9689
model_config.custom_operations = model_options.get("custom_operations", None)
97-
unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None)))
90+
unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None))
9891
if unet_dtype is None:
9992
unet_dtype = mm.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype)
10093

101-
manual_cast_dtype = mm.unet_manual_cast(unet_dtype, torch.device(device_config.get('unet_device')), model_config.supported_inference_dtypes)
94+
unet_compute_device = device_config.get('unet_device', original_main_device)
95+
manual_cast_dtype = mm.unet_manual_cast(unet_dtype, torch.device(unet_compute_device), model_config.supported_inference_dtypes)
10296
model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
10397
logger.info(f"UNet DType: {unet_dtype}, Manual Cast: {manual_cast_dtype}")
10498

105-
# --- CLIP Vision Loading ---
99+
106100
if model_config.clip_vision_prefix is not None and output_clipvision:
107-
logger.info("--- Loading CLIP Vision ---")
108101
clipvision = comfy.clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix, True)
109-
logger.info("CLIP Vision Loaded.")
110102

111-
# --- UNet Loading Block ---
112103
if output_model:
113-
logger.info("--- Loading UNet ---")
114104
unet_compute_device = device_config.get('unet_device', original_main_device)
115-
set_current_device(unet_compute_device)
116-
logger.info(f"Set UNet context to: {unet_compute_device}")
117-
105+
set_current_device(unet_compute_device)
118106
inital_load_device = mm.unet_inital_load_device(parameters, unet_dtype)
119-
logger.info(f"UNet initial load device: {inital_load_device}")
120-
107+
121108
model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
109+
122110
model_patcher = comfy.model_patcher.ModelPatcher(model, load_device=unet_compute_device, offload_device=mm.unet_offload_device())
123111

124112
if distorch_config and 'unet_allocation' in distorch_config:
@@ -131,28 +119,18 @@ def patched_load_state_dict_guess_config(sd, output_vae=True, output_clip=True,
131119
logger.info(f"Stored DisTorch2 config for UNet (hash {model_hash[:8]}): {distorch_config['unet_allocation']}")
132120

133121
model.load_model_weights(sd, diffusion_model_prefix)
134-
logger.info("UNet Loaded.")
135122

136-
# --- VAE Loading Block ---
137123
if output_vae:
138-
logger.info("--- Loading VAE ---")
139124
vae_target_device = torch.device(device_config.get('vae_device', original_main_device))
140125
set_current_device(vae_target_device) # Use main device context for VAE
141-
logger.info(f"Set VAE context to: {vae_target_device}")
142126

143127
vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
144128
vae_sd = model_config.process_vae_state_dict(vae_sd)
145-
146-
# The VAE class itself respects the mm.get_torch_device() patch
147129
vae = VAE(sd=vae_sd, metadata=metadata)
148-
logger.info(f"VAE Loaded. Final device should be: {vae_target_device}")
149130

150-
# --- CLIP Loading Block ---
151131
if output_clip:
152-
logger.info("--- Loading CLIP ---")
153132
clip_target_device = device_config.get('clip_device', original_clip_device)
154133
set_current_text_encoder_device(clip_target_device)
155-
logger.info(f"Set CLIP context to: {clip_target_device}")
156134

157135
clip_target = model_config.clip_target(state_dict=sd)
158136
if clip_target is not None:

0 commit comments

Comments (0)