Add Llama-2 fine-tuning scripts and configuration for ZenFlow
- Introduced `finetune_llama.py` for fine-tuning the Llama-2 model using DeepSpeed and ZenFlow.
- Added `finetune_llama.sh` for automated training setup with environment variables and DeepSpeed command.
- Added `zf_config.json` example for DeepSpeed configuration with ZenFlow optimizations.
Signed-off-by: Tingfeng Lan <[email protected]>
Co-authored-by: Yusen Wu <[email protected]>
This project demonstrates how to fine-tune a [Llama-2](https://huggingface.co/meta-llama) model using [DeepSpeed](https://www.deepspeed.ai/) with **ZenFlow**, a stall-free offloading engine for large-scale model training.
## Quick Start
1. **Install dependencies**
```bash
pip install -r requirements.txt
```
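After installing, DeepSpeed's bundled `ds_report` utility is a quick way to confirm the install and see which ops are compatible with your environment:

```bash
ds_report
```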
2. **Configure training**
Edit `zf_config.json` to enable ZenFlow:
```json
"zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
        "device": "cpu",
        "pin_memory": true
    },
    "zenflow": {
        "topk_ratio": 0.1,
        "update_interval": 4,
        "full_warm_up_rounds": 0,
        "overlap_step": true
    }
}
```
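Roughly, as the names suggest, `topk_ratio` sets the fraction of gradients treated as important, `update_interval` sets how many steps pass between full optimizer updates, and `overlap_step` lets the CPU-side optimizer step run concurrently with GPU compute. The snippet above is only the relevant section: a complete DeepSpeed config also carries the usual training fields (batch size, optimizer, and so on). As a reference point, here is a minimal sketch of how a training script hands such a config to DeepSpeed; the tiny `torch.nn.Linear` model is a placeholder, and the actual wiring lives in `finetune_llama.py`:

```python
import deepspeed
import torch

# Placeholder model for illustration; finetune_llama.py loads Llama-2 instead.
model = torch.nn.Linear(4096, 4096)

# deepspeed.initialize reads the ZeRO/ZenFlow settings from zf_config.json.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="zf_config.json",
)
```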
3. **Run fine-tuning**
```bash
bash finetune_llama.sh
```
This runs Llama-2 fine-tuning with DeepSpeed + ZenFlow and saves checkpoints to `./alpaca_output`.
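Under the hood, `finetune_llama.sh` wraps a DeepSpeed launcher invocation along these lines; everything past the script name is an illustrative assumption about its flags, not the script's actual interface:

```bash
# Sketch only: flag names are assumptions; see finetune_llama.sh for the real ones.
deepspeed finetune_llama.py \
  --deepspeed zf_config.json \
  --output_dir ./alpaca_output
```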
## Example Output
Below is a sample log of step times and loss values. Most steps run fast; the longer step times (steps 8 and 12 here, matching `update_interval: 4`) occur only at the periodic full optimizer updates:
```
ZenFlowCPUAdam initialized with overlap step.
Step 5, Loss: 1.2599, Time: 719.58ms
Step 6, Loss: 0.9847, Time: 702.81ms
Step 7, Loss: 0.6220, Time: 705.50ms
Step 8, Loss: 0.5173, Time: 1912.92ms
Step 9, Loss: 0.4557, Time: 890.60ms
Step 10, Loss: 0.3882, Time: 740.11ms
Step 11, Loss: 0.3627, Time: 731.95ms
Step 12, Loss: 0.3341, Time: 2221.18ms
Step 13, Loss: 0.2453, Time: 1061.80ms
```
ZenFlow reduces optimizer-induced stalls by overlapping CPU computation and GPU execution.
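The scheduling idea behind this can be illustrated with a toy sketch. This is a conceptual illustration of overlap, not ZenFlow's actual implementation; the two `sleep` calls stand in for real work:

```python
import threading
import time

def cpu_optimizer_step():
    # Stand-in for a heavy CPU-side optimizer update on offloaded parameters.
    time.sleep(0.5)

def gpu_forward_backward():
    # Stand-in for the next forward/backward pass running on the GPU.
    time.sleep(0.5)

# Serial execution would take ~1.0s per step; overlapping the CPU step with
# GPU compute hides the optimizer cost, so the step takes ~0.5s.
start = time.time()
worker = threading.Thread(target=cpu_optimizer_step)
worker.start()          # CPU optimizer step proceeds in the background...
gpu_forward_backward()  # ...while the GPU keeps computing.
worker.join()
print(f"overlapped step: {time.time() - start:.2f}s")
```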
## Notes
- To change the model, batch size, or number of epochs, edit `finetune_llama.sh`.
- All DeepSpeed and ZenFlow options are controlled via `zf_config.json`.
## Citation
To cite ZenFlow, please cite our [arXiv report](https://arxiv.org/abs/2505.12242):
```bib
@misc{lan2025zenflowenablingstallfreeoffloading,
      title={ZenFlow: Enabling Stall-Free Offloading Training via Asynchronous Updates},
      author={Tingfeng Lan and Yusen Wu and Bin Ma and Zhaoyuan Su and Rui Yang and Tekin Bicer and Dong Li and Yue Cheng},
      year={2025},
      eprint={2505.12242},
      archivePrefix={arXiv},
      url={https://arxiv.org/abs/2505.12242}
}
```