@@ -331,7 +331,9 @@ def parse_args():
"--test_stop_step",
type=int,
default=0,
help="Training step at which to terminate training during testing.")
help=
"Training non-overflow step at which to terminate training during testing."
)

parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()
@@ -461,6 +463,8 @@ def main():
# Train!
print_rank_0("***** Running training *****", args.global_rank)

non_overflow_step_count = 0

for epoch in range(args.num_train_epochs):
print_rank_0(
f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Generation Batches {min(len(prompt_train_dataloader), len(unsupervised_train_dataloader))}",
@@ -547,7 +551,12 @@ def main():
if args.actor_gradient_checkpointing:
rlhf_engine.actor.gradient_checkpointing_disable()

if args.enable_test_mode and step == args.test_stop_step:
actor_overflow, critic_overflow = trainer.get_overflow()

if not actor_overflow and not critic_overflow:
non_overflow_step_count += 1

if args.enable_test_mode and non_overflow_step_count == args.test_stop_step:
break

if args.enable_test_mode:
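Taken together, the main.py hunks above replace the raw step counter with a count of steps that completed without a loss-scale overflow (i.e. without either fp16 optimizer skipping its update), so test runs always execute the same number of effective optimizer updates before stopping. A minimal sketch of the resulting control flow, for illustration only: dataloader stands in for the real prompt/unsupervised batch loop, while the other names mirror the diff.

non_overflow_step_count = 0

for step, batch in enumerate(dataloader):
    actor_loss, critic_loss = trainer.train_rlhf(batch)

    # Only steps where neither optimizer hit an overflow count toward the stop condition.
    actor_overflow, critic_overflow = trainer.get_overflow()
    if not actor_overflow and not critic_overflow:
        non_overflow_step_count += 1

    if args.enable_test_mode and non_overflow_step_count == args.test_stop_step:
        break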
@@ -235,6 +235,12 @@ def train_rlhf(self, inputs):

return actor_loss, critic_loss

def get_overflow(self):
actor_overflow = self.actor_model.optimizer.overflow
critic_overflow = self.critic_model.optimizer.overflow

return actor_overflow, critic_overflow

def actor_loss_fn(self, logprobs, old_logprobs, advantages, mask):
## policy gradient loss
log_ratio = (logprobs - old_logprobs) * mask
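get_overflow() assumes both DeepSpeed engines wrap an optimizer that exposes an overflow attribute, i.e. whether the last step was skipped due to a loss-scale overflow; this holds for DeepSpeed's fp16/ZeRO optimizers but not necessarily for every configuration. A more defensive variant (an illustrative sketch, not part of this PR) could fall back to False when the attribute is absent:

def get_overflow(self):
    # getattr guards against optimizers (e.g. plain fp32 training) that do not track overflow.
    actor_overflow = getattr(self.actor_model.optimizer, "overflow", False)
    critic_overflow = getattr(self.critic_model.optimizer, "overflow", False)
    return actor_overflow, critic_overflow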
@@ -11,6 +11,8 @@ ENABLE_HYBRID_ENGINE=$5
OFFLOAD=$6
LORA=$7
OUTPUT=$8
TEST=$9
TEST_STOP_STEP=${10}

if [ "$ACTOR_ZERO_STAGE" == "" ]; then
ACTOR_ZERO_STAGE=2
@@ -40,6 +42,14 @@ else
ACTOR_LORA_MODULE_NAME=""
fi

if [ "$TEST" == true ]; then
TEST="--enable_test_mode"
TEST_STOP_STEP="--test_stop_step ${TEST_STOP_STEP}"
else
TEST=""
TEST_STOP_STEP=""
fi

mkdir -p $OUTPUT

Num_Padding_at_Beginning=1 # this is model related
@@ -74,7 +84,8 @@ cmd="deepspeed --num_nodes=1 main.py \
--critic_zero_stage ${CRITIC_ZERO_STAGE} \
--output_dir $OUTPUT \
$ENABLE_HYBRID_ENGINE $OFFLOAD $UNPIN_ACTOR_PARAMETERS \
$ACTOR_LORA_DIM $ACTOR_LORA_MODULE_NAME"
$ACTOR_LORA_DIM $ACTOR_LORA_MODULE_NAME\
$TEST $TEST_STOP_STEP"

echo "----------------------------- DS COMMAND -----------------------------"
echo $cmd
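Note that when the ninth argument is anything other than true, TEST and TEST_STOP_STEP are both set to empty strings, which expand to nothing in the unquoted command, so existing invocations of the script that pass only the first eight positional arguments behave exactly as before.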
applications/DeepSpeed-Chat/training/tests/run_training_test.py (new file: 88 additions, 0 deletions)
@@ -0,0 +1,88 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import os
import subprocess


def file_exists(directory_path, file_name):
return os.path.isfile(os.path.join(directory_path, file_name))


@pytest.fixture(params=["2", "3"])
def zero_stage(request):
return str(request.param)


@pytest.fixture(params=["true", "false"])
def hybrid_engine(request):
return str(request.param)


@pytest.fixture(params=["true", "false"])
def offload(request):
return str(request.param)


@pytest.fixture(params=["true", "false"])
def lora(request):
return str(request.param)


def test_ds_chat(zero_stage, hybrid_engine, offload, lora):
critic_ckpt_dir = os.getenv("CRITIC_CKPT_DIR")
assert critic_ckpt_dir, "Please set CRITIC_CKPT_DIR in your environment"

actor_model = "facebook/opt-125m"
critic_model = critic_ckpt_dir
output_path = "z" + zero_stage + "_he_" + hybrid_engine + "_offload_" + offload + "_lora_" + lora
enable_test_mode = "true"
test_stop_step = "5"
params = [
actor_model,
critic_model,
zero_stage,
zero_stage,
hybrid_engine,
offload,
lora,
output_path,
enable_test_mode,
test_stop_step,
]

# Skip certain combinations
if zero_stage == "2" and hybrid_engine == "true" and offload == "true" and lora == "false":
pytest.skip(
"The combination of [actor_zero_stage==2, critic_zero_stage==2, enable_hybrid_engine=True, offload=True, lora=False] is currently unsupported due to training instability!"
)

if zero_stage == "3" and hybrid_engine == "true" and offload == "true" and lora == "true":
pytest.skip(
"The combination of [actor_zero_stage==3, critic_zero_stage==3, enable_hybrid_engine=True, offload=True, lora=True] is currently unsupported due to training instability!"
)

# cd into execution dir
wd = os.getcwd()
os.chdir("../step3_rlhf_finetuning")
sweep_script = "training_scripts/opt/single_node/sweep/run_single.sh"

# Run bash script
cmd = ["bash", sweep_script] + params
result = subprocess.run(cmd)

# Assertions
try:
result.check_returncode()
except subprocess.CalledProcessError as e:
with open(os.path.join(output_path, f"{output_path}.log"), "r") as f:
print(f.read())
raise e

assert file_exists(f"{output_path}/actor/", "pytorch_model.bin"
), "Actor model was not saved during step 3 training."
assert file_exists(f"{output_path}/critic/", "pytorch_model.bin"
), "Critic model was not saved during step 3 training."