@@ -331,7 +331,9 @@ def parse_args():
"--test_stop_step",
type=int,
default=0,
help="Training step at which to terminate training during testing.")
help=
"Training non-overflow step at which to terminate training during testing."
)

parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()
@@ -461,6 +463,8 @@ def main():
# Train!
print_rank_0("***** Running training *****", args.global_rank)

non_overflow_step_count = 0

for epoch in range(args.num_train_epochs):
print_rank_0(
f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Generation Batches {min(len(prompt_train_dataloader), len(unsupervised_train_dataloader))}",
@@ -547,7 +551,12 @@ def main():
if args.actor_gradient_checkpointing:
rlhf_engine.actor.gradient_checkpointing_disable()

if args.enable_test_mode and step == args.test_stop_step:
actor_overflow, critic_overflow = trainer.get_overflow()

if not actor_overflow and not critic_overflow:
non_overflow_step_count += 1

if args.enable_test_mode and non_overflow_step_count == args.test_stop_step:
break

if args.enable_test_mode:
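Taken together, the main.py hunks above replace the raw step counter with a count of steps that completed without a loss-scale overflow (i.e. without either fp16 optimizer skipping its update), so test runs always execute the same number of effective optimizer updates before stopping. A minimal sketch of the resulting control flow, for illustration only: dataloader stands in for the real prompt/unsupervised batch loop, while the other names mirror the diff.

non_overflow_step_count = 0

for step, batch in enumerate(dataloader):
    actor_loss, critic_loss = trainer.train_rlhf(batch)

    # Only steps where neither optimizer hit an overflow count toward the stop condition.
    actor_overflow, critic_overflow = trainer.get_overflow()
    if not actor_overflow and not critic_overflow:
        non_overflow_step_count += 1

    if args.enable_test_mode and non_overflow_step_count == args.test_stop_step:
        break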
@@ -235,6 +235,12 @@ def train_rlhf(self, inputs):

return actor_loss, critic_loss

def get_overflow(self):
actor_overflow = self.actor_model.optimizer.overflow
critic_overflow = self.critic_model.optimizer.overflow

return actor_overflow, critic_overflow

def actor_loss_fn(self, logprobs, old_logprobs, advantages, mask):
## policy gradient loss
log_ratio = (logprobs - old_logprobs) * mask
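get_overflow() assumes both DeepSpeed engines wrap an optimizer that exposes an overflow attribute, i.e. whether the last step was skipped due to a loss-scale overflow; this holds for DeepSpeed's fp16/ZeRO optimizers but not necessarily for every configuration. A more defensive variant (an illustrative sketch, not part of this PR) could fall back to False when the attribute is absent:

def get_overflow(self):
    # getattr guards against optimizers (e.g. plain fp32 training) that do not track overflow.
    actor_overflow = getattr(self.actor_model.optimizer, "overflow", False)
    critic_overflow = getattr(self.critic_model.optimizer, "overflow", False)
    return actor_overflow, critic_overflow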
@@ -11,6 +11,8 @@ ENABLE_HYBRID_ENGINE=$5
OFFLOAD=$6
LORA=$7
OUTPUT=$8
TEST=$9
TEST_STOP_STEP=${10}

if [ "$ACTOR_ZERO_STAGE" == "" ]; then
ACTOR_ZERO_STAGE=2
@@ -40,6 +42,14 @@ else
ACTOR_LORA_MODULE_NAME=""
fi

if [ "$TEST" == true ]; then
TEST="--enable_test_mode"
TEST_STOP_STEP="--test_stop_step ${TEST_STOP_STEP}"
else
TEST=""
TEST_STOP_STEP=""
fi

mkdir -p $OUTPUT

Num_Padding_at_Beginning=1 # this is model related
@@ -74,7 +84,8 @@ cmd="deepspeed --num_nodes=1 main.py \
--critic_zero_stage ${CRITIC_ZERO_STAGE} \
--output_dir $OUTPUT \
$ENABLE_HYBRID_ENGINE $OFFLOAD $UNPIN_ACTOR_PARAMETERS \
$ACTOR_LORA_DIM $ACTOR_LORA_MODULE_NAME"
$ACTOR_LORA_DIM $ACTOR_LORA_MODULE_NAME\
$TEST $TEST_STOP_STEP"

echo "----------------------------- DS COMMAND -----------------------------"
echo $cmd
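Note that when the ninth argument is anything other than true, TEST and TEST_STOP_STEP are both set to empty strings, which expand to nothing in the unquoted command, so existing invocations of the script that pass only the first eight positional arguments behave exactly as before.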
applications/DeepSpeed-Chat/training/tests/run_training_test.py (new file: 88 additions, 0 deletions)
@@ -0,0 +1,88 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import os
import subprocess


def file_exists(directory_path, file_name):
return os.path.isfile(os.path.join(directory_path, file_name))


@pytest.fixture(params=["2", "3"])
def zero_stage(request):
return str(request.param)


@pytest.fixture(params=["true", "false"])
def hybrid_engine(request):
return str(request.param)


@pytest.fixture(params=["true", "false"])
def offload(request):
return str(request.param)


@pytest.fixture(params=["true", "false"])
def lora(request):
return str(request.param)


def test_ds_chat(zero_stage, hybrid_engine, offload, lora):
critic_ckpt_dir = os.getenv("CRITIC_CKPT_DIR")
assert critic_ckpt_dir, "Please set CRITIC_CKPT_DIR in your environment"

actor_model = "facebook/opt-125m"
critic_model = critic_ckpt_dir
output_path = "z" + zero_stage + "_he_" + hybrid_engine + "_offload_" + offload + "_lora_" + lora
enable_test_mode = "true"
test_stop_step = "5"
params = [
actor_model,
critic_model,
zero_stage,
zero_stage,
hybrid_engine,
offload,
lora,
output_path,
enable_test_mode,
test_stop_step,
]

# Skip certain combinations
if zero_stage == "2" and hybrid_engine == "true" and offload == "true" and lora == "false":
pytest.skip(
"The combination of [actor_zero_stage==2, critic_zero_stage==2, enable_hybrid_engine=True, offload=True, lora=False] is currently unsupported due to training instability!"
)

if zero_stage == "3" and hybrid_engine == "true" and offload == "true" and lora == "true":
pytest.skip(
"The combination of [actor_zero_stage==3, critic_zero_stage==3, enable_hybrid_engine=True, offload=True, lora=True] is currently unsupported due to training instability!"
)

# cd into execution dir
wd = os.getcwd()
os.chdir("../step3_rlhf_finetuning")
sweep_script = "training_scripts/opt/single_node/sweep/run_single.sh"

# Run bash script
cmd = ["bash", sweep_script] + params
result = subprocess.run(cmd)

# Assertions
try:
result.check_returncode()
except subprocess.CalledProcessError as e:
with open(os.path.join(output_path, f"{output_path}.log"), "r") as f:
print(f.read())
raise e

assert file_exists(f"{output_path}/actor/", "pytorch_model.bin"
), "Actor model was not saved during step 3 training."
assert file_exists(f"{output_path}/critic/", "pytorch_model.bin"
), "Critic model was not saved during step 3 training."