Skip to content

Commit 2b780a5

Browse files
committed
[add] ego4d RLDS dataset builder
1 parent 56eab98 commit 2b780a5

14 files changed

Lines changed: 1155 additions & 1 deletion
Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2023 Karl Pertsch
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import numpy as np
2+
from PIL import Image
3+
import os
4+
import json
5+
from tqdm import tqdm
6+
from multiprocessing import Pool
7+
from functools import partial
8+
import argparse
9+
10+
def parse_arguments():
    """Build and parse the CLI options for the Ego4D fake-episode pipeline.

    Returns:
        argparse.Namespace: parsed arguments with fields ``source_dir``,
        ``target_dir``, ``annotation_file``, ``processes``, ``target_size``
        (a 2-element list), and ``verify``.
    """
    cli = argparse.ArgumentParser(
        description='Process Ego4D data to create fake episodes.')

    # Required paths.
    cli.add_argument('--source_dir', type=str, required=True,
                     help='Directory containing the source video clips')
    cli.add_argument('--target_dir', type=str, required=True,
                     help='Directory to save the processed episodes')
    cli.add_argument('--annotation_file', type=str, required=True,
                     help='Path to the annotation JSON file')

    # Tuning knobs.
    cli.add_argument('--processes', type=int, default=96,
                     help='Number of worker processes to use (default: 96)')
    cli.add_argument('--target_size', type=int, nargs=2, default=[224, 224],
                     help='Target size for resizing images as "height width" (default: 224 224)')
    cli.add_argument('--verify', action='store_true',
                     help='Verify saved episodes by loading them after creation')

    return cli.parse_args()
def center_crop_and_resize(image, target_size=(224, 224)):
    """
    Center crop and resize the input image while maintaining aspect ratio.

    Args:
        image (numpy.ndarray): Input image array with shape (H, W, C)
        target_size (tuple): Desired output size as (height, width)

    Returns:
        numpy.ndarray: Resized image array with shape (target_height, target_width, C)
    """
    height, width, _ = image.shape

    # Crop the longer side so the result is a centered square.
    if height < width:
        # Wide image - crop width
        crop_size = height
        start_x = (width - crop_size) // 2
        start_y = 0
    else:
        # Tall image - crop height
        crop_size = width
        start_x = 0
        start_y = (height - crop_size) // 2

    # Perform center crop
    cropped_image = image[start_y:start_y + crop_size, start_x:start_x + crop_size, :]

    # ascontiguousarray: callers pass a negative-stride view (BGR->RGB flip
    # via `[:, :, ::-1]`), which some Pillow versions reject in fromarray.
    pil_image = Image.fromarray(np.ascontiguousarray(cropped_image))
    # BUG FIX: PIL's resize() expects (width, height), while target_size is
    # documented as (height, width).  The original passed target_size through
    # unchanged, which swapped the axes for any non-square target.
    resized_image = pil_image.resize((target_size[1], target_size[0]), Image.BILINEAR)

    return np.array(resized_image)
def create_fake_episode(clip_dir, save_dir, annotation, target_size, verify=False):
    """
    Create a fake episode from a video clip by processing all frames.

    Args:
        clip_dir (str): Path to directory containing clip frames (.npy files)
        save_dir (str): Directory to save the output episode
        annotation (list): List of annotation dictionaries; each entry is
            expected to carry 'video_name', 'action_name', 'language', 'id'
        target_size (tuple): Target size for frame resizing as (height, width)
        verify (bool): Whether to re-load the saved episode as a sanity check

    Returns:
        None (saves episode to disk as .npy file, or returns early if no
        annotation matches or no frame could be processed)
    """
    episode_data = []
    clip_name = os.path.basename(clip_dir)
    video_name = os.path.basename(os.path.dirname(clip_dir))

    # Find the annotation entry that matches this video/clip pair.
    caption = None
    episode_id = None
    for anno in annotation:
        if anno['video_name'] == video_name and anno['action_name'] == clip_name:
            caption = anno['language'][5:]  # Remove first 5 characters '#C C '
            # NOTE(review): assumes annotation ids are 1-based so episodes
            # become 0-based on disk — confirm against the annotation file.
            episode_id = anno['id'] - 1
            break

    if caption is None or episode_id is None:
        print(f"No matching annotation found for {video_name}/{clip_name}")
        return

    save_path = os.path.join(save_dir, f'episode_{episode_id}.npy')

    # Process each frame in the clip.  NOTE(review): lexicographic sort —
    # assumes frame filenames are zero-padded so they sort chronologically.
    for frame_name in sorted(os.listdir(clip_dir)):
        frame_path = os.path.join(clip_dir, frame_name)
        try:
            frame = np.load(frame_path)
            frame = frame[:, :, ::-1]  # Convert BGR to RGB
            frame = center_crop_and_resize(frame, target_size)

            episode_data.append({
                'image': np.asarray(frame, dtype=np.uint8),
                # Placeholder wrist camera / proprio / action fields so the
                # episode matches the RLDS schema expected downstream.
                'wrist_image': np.asarray(np.zeros([1, 1, 1]), dtype=np.uint8),
                'state': np.asarray(np.zeros(7), dtype=np.float32),
                'action': np.asarray(np.zeros(7), dtype=np.float32),
                'language_instruction': caption,
            })
        except Exception as e:
            # Best-effort per frame: a single corrupt frame should not kill
            # the whole episode.
            print(f"Error processing frame {frame_path}: {str(e)}")
            continue

    # BUG FIX: the original saved the episode even when every frame failed,
    # leaving empty .npy files that the verify step then flags.  Skip instead.
    if not episode_data:
        print(f"No frames could be processed for {video_name}/{clip_name}; "
              f"skipping episode {episode_id}")
        return

    # Save the episode data (list of dicts is pickled into an object array).
    np.save(save_path, episode_data)

    # Optional verification step
    if verify:
        try:
            loaded_data = np.load(save_path, allow_pickle=True)
            if len(loaded_data) == 0:
                print(f"Warning: Empty episode saved at {save_path}")
        except Exception as e:
            print(f"Failed to verify saved episode {episode_id}: {str(e)}")
def process_video(video_dir, target_dir, annotation, target_size, verify=False):
    """
    Process all clips within a single video directory.

    Args:
        video_dir (str): Path to video directory containing clip sub-directories
        target_dir (str): Directory to save processed episodes
        annotation (list): List of annotation dictionaries
        target_size (tuple): Target size for frame resizing as (height, width)
        verify (bool): Whether to verify saved episodes
    """
    for clip_name in sorted(os.listdir(video_dir)):
        clip_dir = os.path.join(video_dir, clip_name)
        # ROBUSTNESS FIX: a stray file next to the clip directories (e.g. a
        # metadata JSON) would crash the worker when create_fake_episode
        # calls os.listdir() on it — only recurse into real directories.
        if not os.path.isdir(clip_dir):
            continue
        create_fake_episode(
            clip_dir=clip_dir,
            save_dir=target_dir,
            annotation=annotation,
            target_size=target_size,
            verify=verify
        )
def main():
    """Entry point: parse CLI args, load annotations, fan videos out to a pool."""
    args = parse_arguments()

    # Create target directory if it doesn't exist
    os.makedirs(args.target_dir, exist_ok=True)

    # Load annotation file
    print("Loading annotation file...")
    try:
        with open(args.annotation_file) as f:
            annotation = json.load(f)
    except Exception as e:
        print(f"Failed to load annotation file: {str(e)}")
        return

    # One entry per video directory; stray files in source_dir are skipped.
    video_dirs = [
        os.path.join(args.source_dir, d)
        for d in sorted(os.listdir(args.source_dir))
        if os.path.isdir(os.path.join(args.source_dir, d))
    ]

    print(f"Processing {len(video_dirs)} videos using {args.processes} workers...")

    # Process videos in parallel.  partial() freezes everything except the
    # video directory, so each pool task ships a single string argument.
    with Pool(processes=args.processes) as pool:
        process_func = partial(
            process_video,
            target_dir=args.target_dir,
            annotation=annotation,
            target_size=tuple(args.target_size),
            verify=args.verify
        )

        # Drain the iterator to drive the workers and the progress bar.
        # IDIOM FIX: the original materialized an unused `results` list of
        # None values; iterating is enough.
        for _ in tqdm(
            pool.imap_unordered(process_func, video_dirs),
            total=len(video_dirs),
            desc='Processing videos'
        ):
            pass

    print('Ego4D data processing completed successfully!')

if __name__ == "__main__":
    main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
// TODO(ego4d): BibTeX citation
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
## Converting Ego4D dataset to RLDS
2+
3+
4+
#### Step.0 Prepare Pre-training Dataset
5+
Download [Ego4D](https://ego4d-data.org/docs/start-here/) Hand-and-Object dataset:
6+
```
7+
# Download the CLI
8+
pip install ego4d
9+
# Select Subset Of Hand-and-Object
10+
python -m ego4d.cli.cli --output_directory=<path-to-save-dir> --datasets clips annotations --metadata --version v2 --benchmarks FHO
11+
```
12+
13+
Your directory tree should look like this:
14+
```
15+
$<path-to-ego4d-save-dir>
16+
├── ego4d.json
17+
└── v2
18+
├── annotations
19+
└── clips
20+
```
21+
22+
23+
#### :one: Install necessary dependencies
24+
25+
First create a conda environment using the provided environment.yml file (use `environment_ubuntu.yml` or `environment_macos.yml` depending on the operating system you're using):
26+
```
27+
conda env create -f environment_ubuntu.yml
28+
```
29+
30+
Then activate the environment using:
31+
```
32+
conda activate rlds_env
33+
cd vla-scripts/extern/ego4d_rlds_dataset_builder/ego4d
34+
pip install -e .
35+
```
36+
37+
Then, download all necessary dependencies from [huggingface](https://huggingface.co/datasets/qwbu/univla-ego4d-rlds-dependencies) and put them under ```vla-scripts/extern/ego4d_rlds_dataset_builder```.
38+
39+
40+
#### :two: We first extract the interaction frames (video clips within ```pre_frame``` and ```post_frame```) with a FPS of 2 and save them as ```.npy``` files.
41+
42+
We first process the critical information about the interaction clips and key frames (```pre_frame```, ```pnr_frame```, and ```post_frame```) into a json file (```info_clips.json```) with [this script](https://github.com/OpenDriveLab/MPI/blob/79798d0d6c40919adcf3263c6df7e86758fdd59a/prepare_dataset.py), or you can directly download the json file from [here](https://huggingface.co/datasets/qwbu/univla-ego4d-rlds-dependencies).
43+
44+
```bash
45+
python preprocess_ego4d.py \
46+
--denseclips_dir /path/to/output/denseclips \ # output dir for processed clips
47+
--info_clips_json /path/to/info_clips.json \ # metadata of keyframes
48+
--source_videos_dir <path-to-ego4d-save-dir>/v2/clips \ # ego4d videos path
49+
--frame_interval 15 # downsample Ego4D to 2 fps
50+
```
51+
52+
53+
#### :three: We then create episodes in the desired format with:
54+
55+
```bash
56+
mkdir ../ego4d_rlds_dataset_builder/ego4d/data
57+
mkdir ../ego4d_rlds_dataset_builder/ego4d/data/train
58+
59+
python create_episode_ego4d.py \
60+
--source_dir /path/to/output/denseclips \ # processed clips from the step.2
61+
--target_dir ../ego4d_rlds_dataset_builder/ego4d/data/train \ # path to save episodes
62+
--annotation_file /path/to/output/denseclips/annotations.json \ # processed meta-info from step.2
63+
--processes 64 # multi-processing
64+
```
65+
66+
#### :four: Create ego4d rlds dataset
67+
68+
```bash
69+
cd vla-scripts/extern/ego4d_rlds_dataset_builder/ego4d
70+
tfds build --overwrite --beam_pipeline_options="direct_running_mode=multi_processing,direct_num_workers=16"
71+
```
72+
73+
The default save path for the dataset is `/root/tensorflow_datasets/ego4d_dataset`. Directly processing the whole dataset may run into memory limits, so we can split the dataset into several parts and process them separately:
74+
75+
```bash
76+
cd vla-scripts/extern/ego4d_rlds_dataset_builder/ego4d
77+
mkdir data/val
78+
rsync -av --files-from=<(printf "episode_%d.npy\n" {0000..9999}) data/train/ data/val/
79+
tfds build --overwrite --beam_pipeline_options="direct_running_mode=multi_processing,direct_num_workers=4"
80+
mkdir /root/tensorflow_datasets/ego4d_dataset/ego4d_split_1
81+
mv /root/tensorflow_datasets/ego4d_dataset/1.0.0 /root/tensorflow_datasets/ego4d_dataset/ego4d_split_1/1.0.0
82+
rm -r data/val
83+
84+
rsync -av --files-from=<(printf "episode_%d.npy\n" {10000..19999}) data/train/ data/val/
85+
tfds build --overwrite --beam_pipeline_options="direct_running_mode=multi_processing,direct_num_workers=4"
86+
mkdir /root/tensorflow_datasets/ego4d_dataset/ego4d_split_2
87+
mv /root/tensorflow_datasets/ego4d_dataset/1.0.0 /root/tensorflow_datasets/ego4d_dataset/ego4d_split_2/1.0.0
88+
rm -r data/val
89+
90+
# repeat until all data is processed
91+
```

0 commit comments

Comments
 (0)