@@ -210,7 +210,7 @@ def train_mock() -> run.Partial:
     recipe.data.model_config = recipe.model.config
     recipe.log.log_dir = 'nemo_experiments/train_mock'

-    recipe.trainer.strategy.ddp.use_custom_fsdp = True
+    recipe.trainer.strategy.ddp.use_megatron_fsdp = True
     recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = 'optim_grads_params'
     recipe.trainer.strategy.ddp.overlap_param_gather = True
     recipe.trainer.strategy.ddp.overlap_grad_reduce = True
@@ -236,7 +236,7 @@ def mock_ditllama5b_8k() -> run.Partial:
     recipe.data.model_config = recipe.model.config
     recipe.log.log_dir = 'nemo_experiments/mock_ditllama5b_8k'
     recipe.model.config.attn_mask_type = AttnMaskType.no_mask
-    recipe.trainer.strategy.ddp.use_custom_fsdp = True
+    recipe.trainer.strategy.ddp.use_megatron_fsdp = True
     recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = 'optim_grads_params'
     recipe.trainer.strategy.ddp.overlap_param_gather = True
     recipe.trainer.strategy.ddp.overlap_grad_reduce = True
@@ -360,7 +360,7 @@ def pretrain_ditllama30b() -> run.Partial:
     recipe.data.task_encoder.seq_length = 256
     recipe.data.virtual_epoch_length = 0
     recipe.log.log_dir = 'nemo_experiments/ditllama30b_stage1_mock'
-    recipe.trainer.strategy.ddp.use_custom_fsdp = True
+    recipe.trainer.strategy.ddp.use_megatron_fsdp = True
     recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = 'optim_grads_params'
     recipe.trainer.strategy.ddp.overlap_param_gather = True
     recipe.trainer.strategy.ddp.overlap_grad_reduce = True
@@ -386,7 +386,7 @@ def pretrain_ditllama30b_stage2_mock() -> run.Partial:
     recipe.trainer.val_check_interval = 1.0
     recipe.data.model_config = recipe.model.config
     recipe.log.log_dir = 'nemo_experiments/ditllama30b_stage2_mock'
-    recipe.trainer.strategy.ddp.use_custom_fsdp = True
+    recipe.trainer.strategy.ddp.use_megatron_fsdp = True
     recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = 'optim_grads_params'
     recipe.trainer.strategy.ddp.overlap_param_gather = True
     recipe.trainer.strategy.ddp.overlap_grad_reduce = True
@@ -412,7 +412,7 @@ def pretrain_ditllama30b_stage3_mock() -> run.Partial:
     recipe.trainer.val_check_interval = 1.0
     recipe.data.model_config = recipe.model.config
     recipe.log.log_dir = 'nemo_experiments/ditllama30b_stage3_mock'
-    recipe.trainer.strategy.ddp.use_custom_fsdp = True
+    recipe.trainer.strategy.ddp.use_megatron_fsdp = True
     recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = 'optim_grads_params'
     recipe.trainer.strategy.ddp.overlap_param_gather = True
     recipe.trainer.strategy.ddp.overlap_grad_reduce = True
@@ -512,7 +512,7 @@ def pretrain_ecditllama1b() -> run.Partial:
     recipe.log.log_dir = 'nemo_experiments/ecditllama1b'
     recipe.trainer.val_check_interval = 3000

-    recipe.trainer.strategy.ddp.use_custom_fsdp = True
+    recipe.trainer.strategy.ddp.use_megatron_fsdp = True
     recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = 'optim_grads_params'
     recipe.trainer.strategy.ddp.overlap_param_gather = True
     recipe.trainer.strategy.ddp.overlap_grad_reduce = True
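Every hunk above applies the same four DDP settings, only the `use_custom_fsdp` flag is renamed to `use_megatron_fsdp`. A minimal sketch of gathering those repeated lines into a helper is shown below; `apply_megatron_fsdp` is a hypothetical name for illustration, not part of this change, and the attribute paths are taken verbatim from the diff.

# Sketch only: hypothetical helper mirroring the repeated settings above.
def apply_megatron_fsdp(recipe):
    # Enable Megatron FSDP (renamed from use_custom_fsdp in this change).
    recipe.trainer.strategy.ddp.use_megatron_fsdp = True
    # Shard optimizer state, gradients, and parameters across data-parallel ranks.
    recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = 'optim_grads_params'
    # Overlap parameter all-gather and gradient reduce with compute.
    recipe.trainer.strategy.ddp.overlap_param_gather = True
    recipe.trainer.strategy.ddp.overlap_grad_reduce = True
    return recipe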