@@ -86,6 +86,7 @@
 from verl.utils.profiler.performance import reduce_timing, topk_reduce_ratio_min_max
 from verl.utils.py_functional import convert_to_regular_types
 from verl.workers.config import FSDPCriticConfig, FSDPEngineConfig, HFModelConfig, RolloutConfig
+from verl.workers.config.optimizer import build_optimizer
 from verl.workers.rollout import get_rollout_class
 from verl.workers.sharding_manager.fsdp_ulysses import FSDPUlyssesShardingManager
 
@@ -279,7 +280,6 @@ def _build_model_optimizer(
         role="actor",
         enable_activation_offload=False,
     ):
-        from torch import optim
         from torch.distributed.fsdp import CPUOffload, MixedPrecision
         from transformers import (
             AutoConfig,
@@ -520,12 +520,7 @@ def _build_model_optimizer(
         if role == "actor" and optim_config is not None:
             from verl.utils.torch_functional import get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup
 
-            actor_optimizer = optim.AdamW(
-                actor_module_fsdp.parameters(),
-                lr=optim_config.lr,
-                betas=optim_config.get("betas", (0.9, 0.999)),
-                weight_decay=optim_config.get("weight_decay", 1e-2),
-            )
+            actor_optimizer = build_optimizer(actor_module_fsdp.parameters(), optim_config)
 
             total_steps = optim_config.get("total_training_steps", 0)
             num_warmup_steps = int(optim_config.get("lr_warmup_steps", -1))
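The helper imported above replaces the hand-rolled `optim.AdamW` construction at both the actor and critic call sites. Its implementation in `verl/workers/config/optimizer.py` is not part of this diff; the sketch below is only an assumption about its general shape, reproducing the defaults the removed inline code relied on (the real helper may additionally dispatch on an optimizer-name field in the config).

```python
# Hypothetical sketch of a build_optimizer helper; the real implementation in
# verl/workers/config/optimizer.py is not shown in this diff and may differ.
from torch import optim


def build_optimizer(parameters, optim_config):
    # Mirror the defaults the removed inline code used.
    return optim.AdamW(
        parameters,
        lr=optim_config.lr,
        betas=optim_config.get("betas", (0.9, 0.999)),
        weight_decay=optim_config.get("weight_decay", 1e-2),
    )
```

Centralizing construction this way means new optimizer choices or changed defaults only need to be handled in one place instead of being duplicated across the actor and critic workers.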
@@ -866,7 +861,7 @@ def update_actor(self, data: DataProto):
             metrics["perf/cpu_memory_used_gb"] = psutil.virtual_memory().used / (1024**3)
 
             lr = self.actor_lr_scheduler.get_last_lr()[0]
-            metrics["actor/lr"] = lr
+            metrics["actor/lr"] = lr.item() if torch.is_tensor(lr) else lr
             self.actor_lr_scheduler.step()
 
             # TODO: here, we should return all metrics
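`LRScheduler.get_last_lr()` returns whatever the optimizer stores in its param groups, which with some configurations (for example a learning rate passed as a tensor) is a 0-d tensor rather than a float; logging backends generally expect plain numbers, hence the `.item()` guard. A minimal, standalone illustration of the same guard (the helper name here is ours, not verl's):

```python
import torch


def lr_to_scalar(lr):
    # Hypothetical helper mirroring the guard in the diff: unwrap a 0-d tensor
    # into a Python float, pass plain numbers through unchanged.
    return lr.item() if torch.is_tensor(lr) else lr


assert lr_to_scalar(3e-4) == 3e-4
assert isinstance(lr_to_scalar(torch.tensor(3e-4)), float)
```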
@@ -1187,7 +1182,6 @@ def __init__(self, config: FSDPCriticConfig):
 
     def _build_critic_model_optimizer(self, config):
         # the following line is necessary
-        from torch import optim
         from torch.distributed.fsdp import MixedPrecision
 
         from verl.utils.model import load_valuehead_model, print_model_size
@@ -1368,12 +1362,7 @@ def _build_critic_model_optimizer(self, config):
 
         log_gpu_memory_usage("After critic FSDP", logger=None)
 
-        critic_optimizer = optim.AdamW(
-            critic_module.parameters(),
-            lr=config.optim.lr,
-            betas=config.optim.get("betas", (0.9, 0.999)),
-            weight_decay=config.optim.get("weight_decay", 1e-2),
-        )
+        critic_optimizer = build_optimizer(critic_module.parameters(), config.optim)
 
         total_steps = config.optim.get("total_training_steps", 0)
         num_warmup_steps = int(config.optim.get("lr_warmup_steps", -1))
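The critic site mirrors the actor change: the optimizer now comes from `build_optimizer`, and the lines that follow derive a warmup scheduler from the same optim config. The snippet below is a rough sketch of that downstream wiring under two assumptions not confirmed by this diff: that `get_constant_schedule_with_warmup` follows the usual `(optimizer, num_warmup_steps)` signature, and that a `lr_warmup_steps_ratio` field is consulted when `lr_warmup_steps` is negative.

```python
# Hypothetical sketch of the scheduler wiring that follows the optimizer build.
# `config` and `critic_optimizer` are the locals of the surrounding method.
from verl.utils.torch_functional import get_constant_schedule_with_warmup

total_steps = config.optim.get("total_training_steps", 0)
num_warmup_steps = int(config.optim.get("lr_warmup_steps", -1))
if num_warmup_steps < 0:
    # Fall back to a warmup ratio when an absolute step count is not given.
    num_warmup_steps = int(config.optim.get("lr_warmup_steps_ratio", 0.0) * total_steps)

critic_lr_scheduler = get_constant_schedule_with_warmup(
    optimizer=critic_optimizer, num_warmup_steps=num_warmup_steps
)
```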