Skip to content

Commit 5906ef6

Browse files
Add files via upload
1 parent f9d6e78 commit 5906ef6

23 files changed

+101012
-0
lines changed

eval_metrics.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
6+
7+
# *** Select the runtime device ***
# Use the GPU whenever PyTorch reports one as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# ***

print(">>> Using device:", device)

# *** Read the PPO training log ***
# Later sections read the "response", "epoch" and "reward" columns, plus
# "kl_divergence" when that column is present.
df = pd.read_csv("ppo_logs/ppo_training_log.csv")
# ***

# *** Build the sentiment classifier used for evaluation ***
# The pipeline is given device index 0 when running on CUDA and -1 otherwise.
sentiment_pipe = pipeline(
    "text-classification",
    model="wrmurray/roberta-base-finetuned-imdb",
    device=-1 if device != "cuda" else 0,
)
# ***

# *** Load the GPT-2 tokenizer and model that score perplexity ***
ppl_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Use the EOS token as the padding token.
ppl_tokenizer.pad_token = ppl_tokenizer.eos_token
ppl_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# ***
30+
31+
# *** Define function to compute perplexity for a given text ***
32+
def compute_perplexity(text):
    """Return the perplexity of ``text`` under the module-level GPT-2 model.

    The text is tokenized with ``ppl_tokenizer`` (truncated to at most 512
    tokens) and passed to ``ppl_model`` with the input ids also used as the
    labels; the resulting loss is exponentiated.

    Args:
        text: The string to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    encoded = ppl_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    token_ids = encoded.input_ids.to(device)
    # Evaluation only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        outputs = ppl_model(token_ids, labels=token_ids)
    return torch.exp(outputs.loss).item()
38+
# ***
39+
40+
# *** Define function to compute distinct-n diversity ***
41+
def distinct_n(texts, n):
    """Compute the distinct-n diversity score over a collection of texts.

    Each text is split on whitespace; the score is the number of unique
    n-grams seen across all texts divided by the total number of n-grams.

    Args:
        texts: Iterable of strings.
        n: Size of the n-grams.

    Returns:
        The unique/total n-gram ratio, or 0 when no text yields an n-gram.
    """
    seen = set()
    total = 0
    for text in texts:
        words = text.split()
        count = len(words) - n + 1
        # Texts shorter than n words contribute no n-grams at all.
        if count <= 0:
            continue
        total += count
        seen.update(tuple(words[start:start + n]) for start in range(count))
    if total == 0:
        return 0
    return len(seen) / total
50+
# ***
51+
52+
# *** Evaluate sentiment accuracy using the classifier on each response ***
# truncation=True asks the pipeline's tokenizer to clip over-long responses to
# the classifier's maximum input length, mirroring the truncation that
# compute_perplexity already applies; without it such responses are passed to
# the classifier untruncated.
sentiment_labels = [
    sentiment_pipe(response, truncation=True)[0]["label"]
    for response in df["response"]
]
# "Accuracy" here is the fraction of responses whose predicted label is
# exactly "NEGATIVE".
# NOTE(review): the eval_metrics_summary.txt produced by this script reported
# 0.00% - confirm that "NEGATIVE" is a label string this classifier actually
# emits and that it is the intended target class.
sentiment_accuracy = np.mean([1 if label == "NEGATIVE" else 0 for label in sentiment_labels])
# ***

# *** Compute perplexity for each response and average it ***
perplexities = [compute_perplexity(response) for response in df["response"]]
avg_perplexity = np.mean(perplexities)
# ***

# *** Compute diversity (Distinct-1 and Distinct-2) over all responses ***
dist1 = distinct_n(df["response"], 1)
dist2 = distinct_n(df["response"], 2)
# ***
66+
67+
# *** Plot reward progression ***
# This is the first write into metrics_results/, so create the directory up
# front; otherwise savefig fails when the script is run in a fresh checkout.
os.makedirs("metrics_results", exist_ok=True)

plt.figure(figsize=(8, 4))
plt.plot(df["epoch"], df["reward"], marker="o")
plt.title("Reward Progression over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Reward")
plt.grid(True)
plt.tight_layout()
plt.savefig("metrics_results/reward_progression.png")
# Close the figure once it has been written to disk.
plt.close()
# ***
78+
79+
# *** Save evaluation metrics summary to a text file ***
with open("metrics_results/eval_metrics_summary.txt", "w") as f:
    f.write("Evaluation Metrics Summary\n")
    f.write("--------------------------\n")
    f.write(f"Sentiment Accuracy: {sentiment_accuracy * 100:.2f}%\n")
    f.write(f"Average Perplexity: {avg_perplexity:.2f}\n")
    f.write(f"Distinct-1: {dist1:.4f}\n")
    f.write(f"Distinct-2: {dist2:.4f}\n")

    # When the training log carries KL values, export them alongside the
    # reward and record that in the summary. This stays inside the ``with``
    # block because it appends a line to the still-open summary file.
    if "kl_divergence" in df.columns:
        df[["epoch", "reward", "kl_divergence"]].to_csv("metrics_results/kl_vs_reward.csv", index=False)
        f.write("\nKL vs Reward data saved to metrics_results/kl_vs_reward.csv\n")
# ***

# Report the locations that were actually written (metrics_results/, not ppo_logs/).
print("Evaluation complete. Metrics saved to 'metrics_results/eval_metrics_summary.txt' and reward progression plotted to 'metrics_results/reward_progression.png'.")
96+

eval_metrics.slurm

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
#SBATCH --job-name=eval_metrics
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:10:00
#SBATCH --mem=8G
#SBATCH --output=metrics_results/eval_metrics.out
#SBATCH --error=metrics_results/eval_metrics.err
# NOTE: the --output/--error paths above are relative, and Slurm does not
# create missing directories for them, so metrics_results/ must already exist
# in the directory the job is submitted from.

# Load the toolchain and Miniconda modules, then activate the "grpo" conda env.
ml GCCcore/13.3.0
ml Miniconda3/23.10.0-1
source ~/.bashrc
conda activate grpo

# Stop if the run directory is missing instead of launching the script from
# whatever directory the job happened to start in.
cd /scratch/user/mincheolseong/GRPO_project/ECEN743-GRPO-Project-Proposal/mincheol_runs || exit 1
python eval_metrics.py
19+

metrics_results/eval_metrics.err

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/scratch/user/mincheolseong/.conda/envs/grpo/lib/python3.10/site-packages/huggingface_hub/file_download.py:896: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
2+
warnings.warn(
3+
/scratch/user/mincheolseong/.conda/envs/grpo/lib/python3.10/site-packages/huggingface_hub/file_download.py:896: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
4+
warnings.warn(

metrics_results/eval_metrics.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>>> Using device: cuda
2+
Evaluation complete. Metrics saved to 'ppo_logs/eval_metrics_summary.txt' and reward progression plotted to 'ppo_logs/reward_progression.png'.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Evaluation Metrics Summary
2+
--------------------------
3+
Sentiment Accuracy: 0.00%
4+
Average Perplexity: 2.98
5+
Distinct-1: 0.1807
6+
Distinct-2: 0.2489
7+
8+
KL vs Reward data saved to metrics_results/kl_vs_reward.csv

metrics_results/kl_vs_reward.csv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
epoch,reward,kl_divergence
2+
1,0.7580376863479614,5.060361385345459
3+
2,0.9595156908035278,5.051671028137207
4+
3,0.995012104511261,3.950826168060303
5+
4,0.9967284202575684,-6.376477241516113
6+
5,0.8975580334663391,10.203742980957031
27.8 KB
Loading
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
{
2+
"_name_or_path": "gpt2",
3+
"activation_function": "gelu_new",
4+
"architectures": [
5+
"GPT2LMHeadModel"
6+
],
7+
"attn_pdrop": 0.1,
8+
"bos_token_id": 50256,
9+
"embd_pdrop": 0.1,
10+
"eos_token_id": 50256,
11+
"initializer_range": 0.02,
12+
"layer_norm_epsilon": 1e-05,
13+
"model_type": "gpt2",
14+
"n_ctx": 1024,
15+
"n_embd": 768,
16+
"n_head": 12,
17+
"n_inner": null,
18+
"n_layer": 12,
19+
"n_positions": 1024,
20+
"reorder_and_upcast_attn": false,
21+
"resid_pdrop": 0.1,
22+
"scale_attn_by_inverse_layer_idx": false,
23+
"scale_attn_weights": true,
24+
"summary_activation": null,
25+
"summary_first_dropout": 0.1,
26+
"summary_proj_to_labels": true,
27+
"summary_type": "cls_index",
28+
"summary_use_proj": true,
29+
"task_specific_params": {
30+
"text-generation": {
31+
"do_sample": true,
32+
"max_length": 50
33+
}
34+
},
35+
"torch_dtype": "float32",
36+
"transformers_version": "4.31.0",
37+
"use_cache": true,
38+
"vocab_size": 50257
39+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"_from_model_config": true,
3+
"bos_token_id": 50256,
4+
"eos_token_id": 50256,
5+
"transformers_version": "4.31.0"
6+
}

0 commit comments

Comments
 (0)