
Commit 40ed715

Mixed Precision ZeRO++ (deepspeedai#689)

HeyangQin authored and lekurile committed
Co-authored-by: Lev Kurilenko <[email protected]>
1 parent 62adcab commit 40ed715

File tree

6 files changed: +103 −26 lines


applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py

Lines changed: 12 additions & 0 deletions
@@ -304,6 +304,11 @@ def parse_args():
     parser.add_argument('--enable_ema',
                         action='store_true',
                         help='Enable EMA checkpoint for the model.')
+    ## Mixed Precision LoRA
+    parser.add_argument(
+        '--enable_mixed_precision_lora',
+        action='store_true',
+        help='Enable Mixed Precision LoRA for training and generation.')
     ## Tensorboard logging
     parser.add_argument('--enable_tensorboard',
                         action='store_true',
@@ -444,6 +449,13 @@ def main():
         num_total_iters=num_total_iters,
         args=args)
 
+    # Mixed Precision LoRA
+    if args.enable_mixed_precision_lora:
+        assert args.actor_lora_dim > 0, "Mixed Precision LoRA requires LoRA to be enabled"
+        assert args.actor_zero_stage == 3, "Mixed Precision LoRA requires Zero stage 3"
+        rlhf_engine.actor.optimizer.quantize_nontrainable_params()
+        print_rank_0("Mixed Precision LoRA enabled")
+
     args.end_of_conversation_token = "<|endoftext|>"
 
     ppo_trainer = DeepSpeedPPOTrainerUnsupervised if unsupervised_training_enabled else DeepSpeedPPOTrainer
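
For readers wiring this up outside DeepSpeed-Chat, here is a minimal sketch of the enablement flow added above, factored into a helper. The helper name is hypothetical; args, rlhf_engine, and quantize_nontrainable_params() are the names used in this diff.

# Hypothetical helper mirroring the gating logic added to main() above.
# Assumes `args` carries the argparse flags from this diff and that
# `rlhf_engine.actor` is a DeepSpeed engine initialized with ZeRO stage 3.
def maybe_enable_mixed_precision_lora(args, rlhf_engine):
    if not args.enable_mixed_precision_lora:
        return
    # Preconditions from the asserts above: the feature quantizes the
    # frozen (non-LoRA) base weights, so LoRA must be active, and the
    # quantization hook lives on the ZeRO stage-3 optimizer.
    assert args.actor_lora_dim > 0, "Mixed Precision LoRA requires LoRA to be enabled"
    assert args.actor_zero_stage == 3, "Mixed Precision LoRA requires Zero stage 3"
    rlhf_engine.actor.optimizer.quantize_nontrainable_params()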

applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/rlhf_engine.py

Lines changed: 1 addition & 0 deletions
@@ -75,6 +75,7 @@ def _init_actor(self, actor_model_name_or_path):
             max_out_tokens=self.args.max_prompt_seq_len +
             self.args.max_answer_seq_len,
             enable_tensorboard=self.args.enable_tensorboard,
+            enable_mixed_precision_lora=self.args.enable_mixed_precision_lora,
             tb_path=self.args.tensorboard_path,
             tb_name="step3_actor")
         ds_config[

applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_single.sh

Lines changed: 11 additions & 4 deletions
@@ -10,9 +10,10 @@ CRITIC_ZERO_STAGE=$4
 ENABLE_HYBRID_ENGINE=$5
 OFFLOAD=$6
 LORA=$7
-OUTPUT=$8
-TEST=$9
-TEST_STOP_STEP=${10}
+MIXED_PRECISION_LORA=$8
+OUTPUT=$9
+TEST=${10}
+TEST_STOP_STEP=${11}
 
 if [ "$ACTOR_ZERO_STAGE" == "" ]; then
     ACTOR_ZERO_STAGE=2
@@ -42,6 +43,12 @@ else
     ACTOR_LORA_MODULE_NAME=""
 fi
 
+if [ "$MIXED_PRECISION_LORA" == true ]; then
+    MIXED_PRECISION_LORA="--enable_mixed_precision_lora"
+else
+    MIXED_PRECISION_LORA=""
+fi
+
 if [ "$TEST" == true ]; then
     TEST="--enable_test_mode"
     TEST_STOP_STEP="--test_stop_step ${TEST_STOP_STEP}"
@@ -83,7 +90,7 @@ cmd="deepspeed --num_nodes=1 main.py \
    --actor_zero_stage ${ACTOR_ZERO_STAGE} \
    --critic_zero_stage ${CRITIC_ZERO_STAGE} \
    --output_dir $OUTPUT \
-   $ENABLE_HYBRID_ENGINE $OFFLOAD $UNPIN_ACTOR_PARAMETERS \
+   $ENABLE_HYBRID_ENGINE $OFFLOAD $MIXED_PRECISION_LORA \
    $ACTOR_LORA_DIM $ACTOR_LORA_MODULE_NAME\
    $TEST $TEST_STOP_STEP"
 
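
Since run_single.sh consumes positional parameters, inserting MIXED_PRECISION_LORA at $8 shifts OUTPUT, TEST, and TEST_STOP_STEP to $9, ${10}, and ${11}. A sketch of a caller updated for the new order follows; the argument values are illustrative, and Python is used here only for consistency with the other examples.

import subprocess

# Argument order after this change; each comment names the positional
# parameter the value feeds in run_single.sh.
subprocess.run(
    [
        "bash", "training_scripts/opt/single_node/sweep/run_single.sh",
        "AdamG012/chat-opt-1.3b-sft-deepspeed",     # $1 ACTOR_MODEL_PATH
        "AdamG012/chat-opt-350m-reward-deepspeed",  # $2 CRITIC_MODEL_PATH
        "3",                                        # $3 ACTOR_ZERO_STAGE
        "3",                                        # $4 CRITIC_ZERO_STAGE
        "true",                                     # $5 ENABLE_HYBRID_ENGINE
        "false",                                    # $6 OFFLOAD
        "true",                                     # $7 LORA
        "true",                                     # $8 MIXED_PRECISION_LORA (new)
        "z3_he_true_offload_false_lora_true",       # $9 OUTPUT (was $8)
        "true",                                     # ${10} TEST (was $9)
        "5",                                        # ${11} TEST_STOP_STEP (was ${10})
    ],
    check=True,
)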

applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_step3_sweep.sh

Lines changed: 67 additions & 21 deletions
@@ -6,30 +6,76 @@
 ACTOR_MODEL_PATH="AdamG012/chat-opt-1.3b-sft-deepspeed"
 CRITIC_MODEL_PATH="AdamG012/chat-opt-350m-reward-deepspeed"
 
-for z in {2..3}
-do
-    for he in true false
+# Sweep switches
+RUN_GENERIC_SWEEP=true
+RUN_MPL_SWEEP=true
+
+# Kill any existing Python processes
+pkill -9 python
+sleep 300
+
+# Run generic sweep w/o Mixed Precision ZeRO++
+if [ "$RUN_GENERIC_SWEEP" == true ]; then
+    echo "----------------------------- RUNNING GENERIC SWEEPS -----------------------------"
+    echo ""
+    for z in {2..3}
     do
-    for offload in true false
+        for he in true false
         do
-        for lora in true false
+            for offload in true false
             do
-            cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
-                $ACTOR_MODEL_PATH \
-                $CRITIC_MODEL_PATH \
-                ${z} \
-                ${z} \
-                ${he} \
-                ${offload} \
-                ${lora} \
-                z${z}_he_${he}_offload_${offload}_lora_${lora}"
-            echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
-            echo $cmd
-            $cmd
-            pkill -9 python
-            sleep 60
-            echo ""
+                for lora in true false
+                do
+                    mixed_precision_lora=false
+                    cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
+                        $ACTOR_MODEL_PATH \
+                        $CRITIC_MODEL_PATH \
+                        ${z} \
+                        ${z} \
+                        ${he} \
+                        ${offload} \
+                        ${lora} \
+                        ${mixed_precision_lora} \
+                        z${z}_he_${he}_offload_${offload}_lora_${lora}"
+                    echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
+                    echo $cmd
+                    $cmd
+                    pkill -9 python
+                    sleep 300
+                    echo ""
+                done
             done
         done
     done
-done
+    echo ""
+fi
+
+# Run Mixed Precision ZeRO++ sweep
+if [ "$RUN_MPL_SWEEP" == true ]; then
+    echo "----------------------------- RUNNING MIXED PRECISION ZERO++ SWEEPS -----------------------------"
+    echo ""
+    for he in true false
+    do
+        z=3
+        offload=false
+        lora=true
+        mixed_precision_lora=true
+        cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
+            $ACTOR_MODEL_PATH \
+            $CRITIC_MODEL_PATH \
+            ${z} \
+            ${z} \
+            ${he} \
+            ${offload} \
+            ${lora} \
+            ${mixed_precision_lora} \
+            z${z}_he_${he}_offload_${offload}_lora_${lora}_mpl_${mixed_precision_lora}"
+        echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
+        echo $cmd
+        $cmd
+        pkill -9 python
+        sleep 300
+        echo ""
+    done
+    echo ""
+fi
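
The restructured sweep is equivalent to enumerating two sets of runs: the full ZeRO/hybrid-engine/offload/LoRA cross-product with mixed precision disabled, plus a ZeRO-3, LoRA-only pair of runs with it enabled. A quick Python sketch of the same matrix, illustrative only and not part of the repo:

from itertools import product

configs = []
# Generic sweep: full cross-product, Mixed Precision ZeRO++ off.
for z, he, offload, lora in product((2, 3), (True, False), (True, False),
                                    (True, False)):
    configs.append(dict(z=z, he=he, offload=offload, lora=lora, mpl=False))
# Mixed Precision ZeRO++ sweep: fixed to ZeRO stage 3 with LoRA, no offload.
for he in (True, False):
    configs.append(dict(z=3, he=he, offload=False, lora=True, mpl=True))

print(len(configs))  # 16 generic runs + 2 mixed-precision runs = 18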

applications/DeepSpeed-Chat/training/tests/test_training.py

Lines changed: 5 additions & 1 deletion
@@ -33,14 +33,17 @@ def lora(request):
 
 
 def test_ds_chat(zero_stage, hybrid_engine, offload, lora):
+    # Assert that critic model directory exists
     critic_ckpt_dir = os.getenv("CRITIC_CKPT_DIR")
     assert critic_ckpt_dir, "Please set CRITIC_CKPT_DIR in your environment"
 
+    # Setup params
     actor_model = "facebook/opt-125m"
     critic_model = critic_ckpt_dir
-    output_path = "z" + zero_stage + "_he_" + hybrid_engine + "_offload_" + offload + "_lora_" + lora
+    mixed_precision_lora = "false"
     enable_test_mode = "true"
     test_stop_step = "5"
+    output_path = "z" + zero_stage + "_he_" + hybrid_engine + "_offload_" + offload + "_lora_" + lora
     params = [
         actor_model,
         critic_model,
@@ -49,6 +52,7 @@ def test_ds_chat(zero_stage, hybrid_engine, offload, lora):
         hybrid_engine,
         offload,
         lora,
+        mixed_precision_lora,
         output_path,
         enable_test_mode,
         test_stop_step,

applications/DeepSpeed-Chat/training/utils/ds_utils.py

Lines changed: 7 additions & 0 deletions
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
+
+import torch
+
 GLOBAL_BATCH_SIZE = 32
 MICRO_BATCH_SIZE = 4
 
@@ -15,6 +18,7 @@ def get_train_ds_config(offload,
                         tp_gather_partition_size=8,
                         max_out_tokens=512,
                         enable_tensorboard=False,
+                        enable_mixed_precision_lora=False,
                         tb_path="",
                         tb_name=""):
 
@@ -32,6 +36,9 @@ def get_train_ds_config(offload,
         "stage3_prefetch_bucket_size": 3e7,
         "memory_efficient_linear": False
     }
+    if enable_mixed_precision_lora:
+        zero_opt_dict["zero_quantized_nontrainable_weights"] = True
+        zero_opt_dict["zero_hpz_partition_size"] = torch.cuda.device_count()
     return {
         "train_batch_size": GLOBAL_BATCH_SIZE,
         "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE,
