test config

Kyle1668 · Kyle1668 · commit 9c18c3861795 · 2025-05-28T23:40:39.000-05:00
diff --git a/configs/poly_pythia/2-8B_Seed_1.yml b/configs/poly_pythia/2-8B_Seed_1.yml
@@ -0,0 +1,112 @@
+{
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+
+  "num_layers": 32,
+  "hidden_size": 2560,
+  "num_attention_heads": 32,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "pos_emb": "rotary",
+  "rotary_pct": 0.25,
+  "no_weight_tying": true,
+  "gpt_j_residual": true,
+  "output_layer_parallelism": "column",
+
+  "attention_config": [[["flash"], 32]],
+
+  "scaled_upper_triang_masked_softmax_fusion": true,
+  "bias_gelu_fusion": true,
+
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.00016,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8
+    }
+  },
+  "min_lr": 0.000016,
+
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+  },
+
+  "train_micro_batch_size_per_gpu": 8,
+  "gradient_accumulation_steps": 2,
+  "data_impl": "mmap",
+  "num_workers": 1,
+
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.1,
+  "hidden_dropout": 0,
+  "attention_dropout": 0,
+
+  "fp16": {
+    "fp16": true,
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 12,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+
+  "train_iters": 143000,
+  "lr_decay_iters": 143000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": 1000,
+  "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
+  "eval_interval": 40000,
+  "eval_iters": 10,
+
+  "log_grad_norm": true,
+
+  "log_interval": 10,
+  "steps_per_print": 10,
+  "wall_clock_breakdown": true,
+
+  "tokenizer_type": "HFTokenizer",
+
+  # Run-Specific
+  "seed": 1,
+
+  # Test Dataset
+  "data_path": "/data/enwik8/enwik8_text_document",
+  "vocab_file": "/data/neox_tokenizer/tokenizer.json",
+
+  # Checkpoints
+  "save": "/checkpoints/2-8b-seed-1",
+  "load": "/checkpoints/2-8b-seed-1",
+  "checkpoint_validation_with_forward_pass": False,
+
+  # Wandb
+  "use_wandb": True,
+  "wandb_host": "https://api.wandb.ai",
+  "wandb_project": "pythia-extra-seeds",
+  "wandb_team": "eleutherai",
+  "wandb_run_name": "2-8b-seed-1-enwik8",
+
+  # Distributed Training
+  "hostfile": "/workspace/hostfile",
+  "deepspeed_mpi": True,
+  "launcher": "openmpi",
+  "deepspeed_extra_args": { "ssh_port": 2222 },
+}
diff --git a/configs/poly_pythia/2-8B_Seed_1_single_node_test.yml b/configs/poly_pythia/2-8B_Seed_1_single_node_test.yml
@@ -0,0 +1,111 @@
+{
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+
+  "num_layers": 32,
+  "hidden_size": 2560,
+  "num_attention_heads": 32,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "pos_emb": "rotary",
+  "rotary_pct": 0.25,
+  "no_weight_tying": true,
+  "gpt_j_residual": true,
+  "output_layer_parallelism": "column",
+
+  "attention_config": [[["flash"], 32]],
+
+  "scaled_upper_triang_masked_softmax_fusion": true,
+  "bias_gelu_fusion": true,
+
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.00016,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8
+    }
+  },
+  "min_lr": 0.000016,
+
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+  },
+
+  "train_micro_batch_size_per_gpu": 48,
+  "gradient_accumulation_steps": 1,
+  "data_impl": "mmap",
+  "num_workers": 1,
+
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.1,
+  "hidden_dropout": 0,
+  "attention_dropout": 0,
+
+  "fp16": {
+    "fp16": true,
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 12,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+
+  "train_iters": 1000,
+  "lr_decay_iters": 1000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": null,
+  "eval_interval": 40000,
+  "eval_iters": 10,
+
+  "log_grad_norm": true,
+
+  "log_interval": 10,
+  "steps_per_print": 10,
+  "wall_clock_breakdown": true,
+
+  "tokenizer_type": "HFTokenizer",
+
+  # Run-Specific
+  "seed": 1,
+
+  # Test Dataset
+  "data_path": "/data/enwik8/enwik8_text_document",
+  "vocab_file": "/data/neox_tokenizer/tokenizer.json",
+
+  # Checkpoints
+  "save": null,
+  "load": null,
+  "checkpoint_validation_with_forward_pass": False,
+
+  # Wandb
+  "use_wandb": True,
+  "wandb_host": "https://api.wandb.ai",
+  "wandb_project": "pythia-extra-seeds",
+  "wandb_team": "eleutherai",
+  "wandb_run_name": "2-8b-seed-1-enwik8-single-node-test",
+
+  # Distributed Training
+  "hostfile": "/workspace/hostfile",
+  "deepspeed_mpi": True,
+  "launcher": "openmpi",
+  "deepspeed_extra_args": { "ssh_port": 2222 },
+}