# config_g2rpo.yaml — GRPO (gs_grpo) training configuration, EasyR1/verl-style.
# (Web-scrape page chrome and the rendered line-number gutter removed so the
# file parses as YAML; original listing: 110 lines / 104 loc / 2.94 KB.)
# Dataset and prompt/response shaping.
data:
  train_files: hiyouga/math12k@train  # HF dataset@split
  val_files: ""  # empty string: no separate validation set configured here
  test_files: []
  prompt_key: problem
  answer_key: answer
  image_key: images
  video_key: videos
  image_dir: null
  val_image_dir: null
  video_fps: 2.0
  max_prompt_length: 16384
  max_response_length: 4096
  rollout_batch_size: 512  # equivalent to verl's data.train_batch_size
  mini_rollout_batch_size: null  # equivalent to verl's data.gen_batch_size
  val_batch_size: 1024
  # NOTE(review): qwen3 format prompt paired with a Qwen2.5 model in
  # worker.actor.model — confirm this pairing is intended.
  format_prompt: examples/format_prompt/math_qwen3.jinja
  override_chat_template: null
  shuffle: true
  seed: 1
  # Vision-input pixel bounds (unused for the text-only math task unless
  # image/video keys are populated).
  min_pixels: 3136
  max_pixels: 1048576
  filter_overlong_prompts: false
# RL algorithm settings.
algorithm:
  adv_estimator: gs_grpo
  # KL handled as a loss term (use_kl_loss) rather than a reward penalty
  # (disable_kl).
  disable_kl: true
  use_kl_loss: true
  use_entropy_loss: false
  kl_penalty: low_var_kl
  kl_coef: 1.0e-2
  online_filtering: true  # dapo filter groups
  # Drop prompt groups whose mean accuracy falls outside (filter_low,
  # filter_high) — removes all-wrong and all-right groups with no gradient
  # signal.
  filter_key: accuracy
  filter_low: 0.01
  filter_high: 0.99
# Worker roles: actor (policy being trained), rollout (vLLM generation),
# ref (frozen reference policy for KL), reward (scoring function).
worker:
  actor:
    global_batch_size: 128  # equivalent to verl's actor.ppo_mini_batch_size
    micro_batch_size_per_device_for_update: 1  # equivalent to verl's actor.ppo_micro_batch_size_per_gpu
    micro_batch_size_per_device_for_experience: 2  # equivalent to verl's rollout.log_prob_micro_batch_size_per_gpu
    max_grad_norm: 1.0
    padding_free: true
    dynamic_batching: true
    ulysses_size: 1  # sequence-parallel degree; 1 disables Ulysses SP
    model:
      model_path: Qwen/Qwen2.5-7B-Instruct
      enable_gradient_checkpointing: true
      trust_remote_code: false
      freeze_vision_tower: false
    optim:
      lr: 5.0e-6
      weight_decay: 1.0e-2
      strategy: adamw  # {adamw, adamw_bf16}
      lr_warmup_ratio: 0.0
    fsdp:
      enable_full_shard: true
      enable_cpu_offload: false
      enable_rank0_init: true
    offload:
      offload_params: false  # true: more CPU memory; false: more GPU memory
      offload_optimizer: false  # true: more CPU memory; false: more GPU memory
  rollout:
    n: 8  # responses sampled per prompt (GRPO group size)
    temperature: 1.0
    top_p: 1.0
    limit_images: 0
    gpu_memory_utilization: 0.5
    enforce_eager: false
    enable_chunked_prefill: false
    tensor_parallel_size: 2
    disable_tqdm: true
    max_num_batched_tokens: 20480
    # Greedier sampling for validation than for training rollouts.
    val_override_config:
      temperature: 0.7
      top_p: 0.95
      n: 1
  ref:
    fsdp:
      enable_full_shard: true
      enable_cpu_offload: false  # true: more CPU memory; false: more GPU memory
      enable_rank0_init: true
    offload:
      offload_params: false
  reward:
    reward_type: batch
    reward_function: verl/reward_function/multitask_reward.py:compute_score
# Training-loop orchestration, logging, and checkpointing.
trainer:
  total_epochs: 1
  max_steps: null  # null: derive step count from epochs and dataset size
  project_name: easy_r1
  experiment_name: qwen2_5_7b_math_grpo
  logger: ["file", "wandb"]
  nnodes: 1
  n_gpus_per_node: 8
  max_try_make_batch: 20  # -1 means no limit
  val_freq: 1  # -1 to disable
  val_before_train: false
  val_only: false
  val_generations_to_log: 3
  save_freq: 200  # -1 to disable
  save_limit: 3  # -1 to disable
  save_model_only: false
  # NOTE(review): path says qwen3-8b but the model/experiment are
  # Qwen2.5-7B (qwen2_5_7b_math_grpo) — likely stale copy-paste; confirm.
  save_checkpoint_path: checkpoints/qwen3-8b-rl
  load_checkpoint_path: null
  find_last_checkpoint: true