Coverted WanVideoImageToVideoEncode to shim

pollockjj · pollockjj · commit b04ae1e3787d · 2025-10-09T12:32:29.000-05:00
diff --git a/wanvideo.py b/wanvideo.py
@@ -472,156 +472,27 @@ def process(self, width, height, num_frames, force_offload, noise_aug_strength,
                     temporal_mask=None, extra_latents=None, clip_embeds=None, tiled_vae=False, add_cond_latents=None, vae=None, load_device=None):
         from . import set_current_device
 
+        original_encoder = NODE_CLASS_MAPPINGS["WanVideoImageToVideoEncode"]()
+        encoder_module = inspect.getmodule(original_encoder)
+
+        original_module_device = encoder_module.device
+        original_module_offload = encoder_module.offload_device
+
         set_current_device(load_device)
-        
-        logger.info(f"[MultiGPU WanVideoWrapper][WanVideoImageToVideoEncodeMultiGPU] load device: {load_device}")
 
-        device = mm.get_torch_device()
-        PATCH_SIZE = (1, 2, 2)
-        offload_device = mm.unet_offload_device()
+        compute_device_to_be_patched = mm.get_torch_device()
+        encoder_module.device = compute_device_to_be_patched
 
-        logger.info(f"[MultiGPU WanVideoWrapper][WanVideoImageToVideoEncodeMultiGPU] torch device: {device}")
+        encoder_module.offload_device = mm.unet_offload_device()
 
-        if vae is not None:
-            vae = vae[0]
-        
-        if start_image is None and end_image is None and add_cond_latents is None:
-            return WanVideoEmptyEmbeds().process(
-                num_frames, width, height, control_embeds=control_embeds, extra_latents=extra_latents,
-            )
-        if vae is None:
-            raise ValueError("VAE is required for image encoding.")
-        H = height
-        W = width
-        
-        lat_h = H // vae.upsampling_factor
-        lat_w = W // vae.upsampling_factor
-
-        num_frames = ((num_frames - 1) // 4) * 4 + 1
-        two_ref_images = start_image is not None and end_image is not None
-
-        if start_image is None and end_image is not None:
-            fun_or_fl2v_model = True # end image alone only works with this option
-
-        base_frames = num_frames + (1 if two_ref_images and not fun_or_fl2v_model else 0)
-        if temporal_mask is None:
-            mask = torch.zeros(1, base_frames, lat_h, lat_w, device=device, dtype=vae.dtype)
-            if start_image is not None:
-                mask[:, 0:start_image.shape[0]] = 1  # First frame
-            if end_image is not None:
-                mask[:, -end_image.shape[0]:] = 1  # End frame if exists
-        else:
-            mask = common_upscale(temporal_mask.unsqueeze(1).to(device), lat_w, lat_h, "nearest", "disabled").squeeze(1)
-            if mask.shape[0] > base_frames:
-                mask = mask[:base_frames]
-            elif mask.shape[0] < base_frames:
-                mask = torch.cat([mask, torch.zeros(base_frames - mask.shape[0], lat_h, lat_w, device=device)])
-            mask = mask.unsqueeze(0).to(device, vae.dtype)
-
-        # Repeat first frame and optionally end frame
-        start_mask_repeated = torch.repeat_interleave(mask[:, 0:1], repeats=4, dim=1) # T, C, H, W
-        if end_image is not None and not fun_or_fl2v_model:
-            end_mask_repeated = torch.repeat_interleave(mask[:, -1:], repeats=4, dim=1) # T, C, H, W
-            mask = torch.cat([start_mask_repeated, mask[:, 1:-1], end_mask_repeated], dim=1)
-        else:
-            mask = torch.cat([start_mask_repeated, mask[:, 1:]], dim=1)
-
-        # Reshape mask into groups of 4 frames
-        mask = mask.view(1, mask.shape[1] // 4, 4, lat_h, lat_w) # 1, T, C, H, W
-        mask = mask.movedim(1, 2)[0]# C, T, H, W
-
-        # Resize and rearrange the input image dimensions
-        if start_image is not None:
-            start_image = start_image[..., :3]
-            if start_image.shape[1] != H or start_image.shape[2] != W:
-                resized_start_image = common_upscale(start_image.movedim(-1, 1), W, H, "lanczos", "disabled").movedim(0, 1)
-            else:
-                resized_start_image = start_image.permute(3, 0, 1, 2) # C, T, H, W
-            resized_start_image = resized_start_image * 2 - 1
-            if noise_aug_strength > 0.0:
-                resized_start_image = add_noise_to_reference_video(resized_start_image, ratio=noise_aug_strength)
-        
-        if end_image is not None:
-            end_image = end_image[..., :3]
-            if end_image.shape[1] != H or end_image.shape[2] != W:
-                resized_end_image = common_upscale(end_image.movedim(-1, 1), W, H, "lanczos", "disabled").movedim(0, 1)
-            else:
-                resized_end_image = end_image.permute(3, 0, 1, 2) # C, T, H, W
-            resized_end_image = resized_end_image * 2 - 1
-            if noise_aug_strength > 0.0:
-                resized_end_image = add_noise_to_reference_video(resized_end_image, ratio=noise_aug_strength)
-            
-        # Concatenate image with zero frames and encode
-        if temporal_mask is None:
-            if start_image is not None and end_image is None:
-                zero_frames = torch.zeros(3, num_frames-start_image.shape[0], H, W, device=device, dtype=vae.dtype)
-                concatenated = torch.cat([resized_start_image.to(device, dtype=vae.dtype), zero_frames], dim=1)
-                del resized_start_image, zero_frames
-            elif start_image is None and end_image is not None:
-                zero_frames = torch.zeros(3, num_frames-end_image.shape[0], H, W, device=device, dtype=vae.dtype)
-                concatenated = torch.cat([zero_frames, resized_end_image.to(device, dtype=vae.dtype)], dim=1)
-                del zero_frames
-            elif start_image is None and end_image is None:
-                concatenated = torch.zeros(3, num_frames, H, W, device=device, dtype=vae.dtype)
-            else:
-                if fun_or_fl2v_model:
-                    zero_frames = torch.zeros(3, num_frames-(start_image.shape[0]+end_image.shape[0]), H, W, device=device, dtype=vae.dtype)
-                else:
-                    zero_frames = torch.zeros(3, num_frames-1, H, W, device=device, dtype=vae.dtype)
-                concatenated = torch.cat([resized_start_image.to(device, dtype=vae.dtype), zero_frames, resized_end_image.to(device, dtype=vae.dtype)], dim=1)
-                del resized_start_image, zero_frames
-        else:
-            temporal_mask = common_upscale(temporal_mask.unsqueeze(1), W, H, "nearest", "disabled").squeeze(1)
-            concatenated = resized_start_image[:,:num_frames].to(vae.dtype) * temporal_mask[:num_frames].unsqueeze(0).to(vae.dtype)
-            del resized_start_image, temporal_mask
-
-        mm.soft_empty_cache()
-        gc.collect()
-
-        vae.to(device)
-        y = vae.encode([concatenated], device, end_=(end_image is not None and not fun_or_fl2v_model),tiled=tiled_vae)[0]
-        del concatenated
-
-        has_ref = False
-        if extra_latents is not None:
-            samples = extra_latents["samples"].squeeze(0)
-            y = torch.cat([samples, y], dim=1)
-            mask = torch.cat([torch.ones_like(mask[:, 0:samples.shape[1]]), mask], dim=1)
-            num_frames += samples.shape[1] * 4
-            has_ref = True
-        y[:, :1] *= start_latent_strength
-        y[:, -1:] *= end_latent_strength
-
-        # Calculate maximum sequence length
-        patches_per_frame = lat_h * lat_w // (PATCH_SIZE[1] * PATCH_SIZE[2])
-        frames_per_stride = (num_frames - 1) // 4 + (2 if end_image is not None and not fun_or_fl2v_model else 1)
-        max_seq_len = frames_per_stride * patches_per_frame
-
-        if add_cond_latents is not None:
-            add_cond_latents["ref_latent_neg"] = vae.encode(torch.zeros(1, 3, 1, H, W, device=device, dtype=vae.dtype), device)
-        
-        if force_offload:
-            vae.model.to(offload_device)
-            mm.soft_empty_cache()
-            gc.collect()
-
-        image_embeds = {
-            "image_embeds": y,
-            "clip_context": clip_embeds.get("clip_embeds", None) if clip_embeds is not None else None,
-            "negative_clip_context": clip_embeds.get("negative_clip_embeds", None) if clip_embeds is not None else None,
-            "max_seq_len": max_seq_len,
-            "num_frames": num_frames,
-            "lat_h": lat_h,
-            "lat_w": lat_w,
-            "control_embeds": control_embeds["control_embeds"] if control_embeds is not None else None,
-            "end_image": resized_end_image if end_image is not None else None,
-            "fun_or_fl2v_model": fun_or_fl2v_model,
-            "has_ref": has_ref,
-            "add_cond_latents": add_cond_latents,
-            "mask": mask
-        }
+        inner_vae = vae[0]
 
-        return (image_embeds,)
+        try:
+            return original_encoder.process(width, height, num_frames, force_offload, noise_aug_strength, start_latent_strength, end_latent_strength, start_image,
+                end_image, control_embeds, fun_or_fl2v_model, temporal_mask, extra_latents, clip_embeds, tiled_vae, add_cond_latents, inner_vae,)
+        finally:
+            encoder_module.device = original_module_device
+            encoder_module.offload_device = original_module_offload
 
 class WanVideoDecode:
     @classmethod