
Commit 8bc1acc

Add files via upload
1 parent a8f060f commit 8bc1acc

File tree

1 file changed: +104, -0 lines changed


Mincheol/run_ppo.py

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
# run_ppo.py

import csv
import os
import warnings

import torch
from transformers import GPT2Tokenizer, pipeline
from datasets import load_from_disk
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

warnings.filterwarnings("ignore", message="`resume_download` is deprecated")
warnings.filterwarnings("ignore", message="Xformers is not installed correctly")
warnings.filterwarnings("ignore", message="No dataset is provided.")

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and policy model (GPT-2 with a value head for PPO)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2").to(device)

# Load preprocessed IMDb data (negative reviews only)
dataset = load_from_disk("tokenized_imdb_negative")

# Sample a few prompts for training.
# The negative split holds 12,500 reviews; only 50 are used here for a minimal run,
# and each prompt is an instruction plus the first 64 tokens of a review.
prompts = [
    "Generate a negative movie review:\n" + tokenizer.decode(example["input_ids"][:64])
    for example in dataset.select(range(50))
]

print("prompts", prompts)

# Load reward model (IMDb classifier)
reward_pipe = pipeline(
    "text-classification",
    model="wrmurray/roberta-base-finetuned-imdb",
    device=0 if device == "cuda" else -1,
)

# PPO config
ppo_config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5,
    batch_size=1,
    mini_batch_size=1,
    ppo_epochs=4,
    log_with="tensorboard",
    kl_penalty="kl",
    target_kl=6.0,
)

ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
)
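
# Note (assumption, based on the trl 0.x PPOTrainer API): step() takes lists of
# query, response, and reward tensors whose length matches batch_size, and returns
# a dict of training statistics that is used for logging below.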

# CSV log of per-step training metrics
os.makedirs("ppo_logs", exist_ok=True)  # make sure the log directory exists
log_file = open("ppo_logs/ppo_training_log.csv", "w", newline='')
csv_writer = csv.writer(log_file)
csv_writer.writerow(["step", "reward", "kl_divergence", "response"])

# Training loop: one PPO update per prompt
for step, prompt in enumerate(prompts):
    # Encode prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # Generate response
    generation_output = model.generate(
        input_ids,
        max_new_tokens=64,
        pad_token_id=tokenizer.eos_token_id
    )
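    # Note (assumption): without sampling arguments, generate() defaults to greedy
    # decoding, so a given prompt always yields the same continuation. PPO rollouts
    # usually benefit from exploration, e.g. passing do_sample=True (and top_k / top_p)
    # to generate(); the defaults are left unchanged here.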
    response = tokenizer.decode(generation_output[0][input_ids.shape[-1]:], skip_special_tokens=True)

    # Compute reward
    reward_output = reward_pipe(response)
    reward_score = reward_output[0]["score"]
    reward_tensor = torch.tensor(reward_score).to(device)
    rewards = [reward_tensor]
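    # Note (assumption): a text-classification pipeline returns [{"label": ..., "score": ...}],
    # so reward_score is the classifier's confidence in whichever label it predicted, not a
    # negativity score. If the goal is to reward negative sentiment specifically, a
    # label-aware reward could be used instead (exact label names depend on the checkpoint):
    #     label = reward_output[0]["label"]
    #     score = reward_output[0]["score"]
    #     reward_score = score if "NEG" in label.upper() else 1.0 - score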

    # PPO step (a single update per prompt)
    query_tensor = tokenizer(prompt, return_tensors="pt").input_ids[0].to(device)
    response_tensor = tokenizer(response, return_tensors="pt").input_ids[0].to(device)
    train_stats = ppo_trainer.step([query_tensor], [response_tensor], rewards)

    kl_value = train_stats.get("kl", train_stats.get("objective/kl", None))

    csv_writer.writerow([step + 1, reward_score, kl_value, response])

    # Log progress
    print(f"[{step+1}/{len(prompts)}] Reward: {reward_score:.4f} | Response: {response[:80]}...", flush=True)

print("Training complete.")
98+
99+
# Save fine-tuned model
100+
model.save_pretrained("ppo_gpt2_finetuned_model")
101+
tokenizer.save_pretrained("ppo_gpt2_finetuned_model")
102+
103+
print("Saving complete.")
104+
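
For reference, a minimal usage sketch of reloading the saved checkpoint and sampling a review from it. The directory name matches the save_pretrained calls above; loading the checkpoint as a plain GPT2LMHeadModel and the sampling settings are illustrative assumptions, not part of this commit.

# sample_from_ppo_model.py (illustrative sketch)
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# AutoModelForCausalLMWithValueHead.save_pretrained() saves the underlying language-model
# weights, so the checkpoint can typically be reloaded as a plain GPT-2 LM for inference
# (any value-head parameters in the state dict are simply not used here).
tokenizer = GPT2Tokenizer.from_pretrained("ppo_gpt2_finetuned_model")
model = GPT2LMHeadModel.from_pretrained("ppo_gpt2_finetuned_model").to(device)
model.eval()

prompt = "Generate a negative movie review:\n"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

with torch.no_grad():
    output = model.generate(
        input_ids,
        max_new_tokens=64,
        do_sample=True,      # sampling settings are assumptions, chosen for variety
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))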
