mincheolseong
diff --git a/‎eval_metrics.py‎
Lines changed: 96 additions & 0 deletions b/‎eval_metrics.py‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎eval_metrics.slurm‎
Lines changed: 19 additions & 0 deletions b/‎eval_metrics.slurm‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎metrics_results/eval_metrics.err‎
Lines changed: 4 additions & 0 deletions b/‎metrics_results/eval_metrics.err‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎metrics_results/eval_metrics.out‎
Lines changed: 2 additions & 0 deletions b/‎metrics_results/eval_metrics.out‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎metrics_results/eval_metrics_summary.txt‎
Lines changed: 8 additions & 0 deletions b/‎metrics_results/eval_metrics_summary.txt‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎metrics_results/kl_vs_reward.csv‎
Lines changed: 6 additions & 0 deletions b/‎metrics_results/kl_vs_reward.csv‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎metrics_results/reward_progression.png‎
27.8 KB b/‎metrics_results/reward_progression.png‎
27.8 KB
diff --git a/‎ppo_gpt2_finetuned_model/config.json‎
Lines changed: 39 additions & 0 deletions b/‎ppo_gpt2_finetuned_model/config.json‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎ppo_gpt2_finetuned_model/generation_config.json‎
Lines changed: 6 additions & 0 deletions b/‎ppo_gpt2_finetuned_model/generation_config.json‎
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,96 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
+
+# *** Pick the compute device: CUDA when torch reports a GPU, otherwise CPU ***
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+# ***
+
+print(">>> Using device:", device)
+
+# *** Read the PPO training log (CSV) into a DataFrame ***
+df = pd.read_csv("ppo_logs/ppo_training_log.csv")
+# ***
+
+# *** Build the sentiment classifier used for evaluation ***
+# The pipeline is given a device index: 0 when running on CUDA, -1 otherwise.
+sentiment_pipe = pipeline(
+    task="text-classification",
+    model="wrmurray/roberta-base-finetuned-imdb",
+    device=-1 if device != "cuda" else 0,
+)
+# ***
+
+# *** Load the GPT-2 tokenizer and model used as the perplexity scorer ***
+ppl_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+# Reuse the EOS token as the pad token.
+ppl_tokenizer.pad_token = ppl_tokenizer.eos_token
+ppl_model = GPT2LMHeadModel.from_pretrained("gpt2")
+ppl_model.to(device)
+# ***
+
+# *** Define function to compute perplexity for a given text ***
+def compute_perplexity(text):
+    """Return the perplexity of ``text`` under the module-level GPT-2 model.
+
+    The text is tokenized with ``ppl_tokenizer`` (truncated to at most 512
+    tokens), scored by ``ppl_model`` with the input ids doubling as labels,
+    and the resulting loss is exponentiated.
+
+    Args:
+        text: The string to score.
+
+    Returns:
+        ``exp(loss)`` as a Python float.
+    """
+    encoded = ppl_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+    token_ids = encoded.input_ids.to(device)
+    # Inference only, so gradient tracking is switched off.
+    with torch.no_grad():
+        outputs = ppl_model(token_ids, labels=token_ids)
+    return torch.exp(outputs.loss).item()
+# ***
+
+# *** Define function to compute distinct-n diversity ***
+def distinct_n(texts, n):
+    """Return the distinct-n ratio over a collection of texts.
+
+    Each text is split on whitespace; the ratio is the number of unique
+    n-grams across all texts divided by the total number of n-grams.
+
+    Args:
+        texts: Iterable of strings.
+        n: N-gram length.
+
+    Returns:
+        ``unique / total`` as a float, or ``0`` when no text yields an n-gram.
+    """
+    seen = set()
+    count = 0
+    for text in texts:
+        words = text.split()
+        windows = len(words) - n + 1
+        # Texts shorter than n contribute no n-grams.
+        if windows <= 0:
+            continue
+        count += windows
+        seen.update(tuple(words[k:k + n]) for k in range(windows))
+    if count == 0:
+        return 0
+    return len(seen) / count
+# ***
+
+# *** Evaluate Sentiment Accuracy using classifier on each response ***
+# "Accuracy" is the fraction of responses the classifier labels as negative.
+# Inputs are truncated to the classifier's maximum length, mirroring the
+# truncation applied in the perplexity path, so long responses do not crash.
+sentiment_labels = [
+    sentiment_pipe(response, truncation=True)[0]["label"]
+    for response in df["response"]
+]
+# Normalise the label before comparing so checkpoints that emit "neg" or
+# "Negative" instead of "NEGATIVE" are still counted.
+# NOTE(review): confirm the classifier's label names and the intended target
+# sentiment against the model card and the PPO reward definition.
+sentiment_accuracy = np.mean(
+    [1 if str(label).upper().startswith("NEG") else 0 for label in sentiment_labels]
+)
+# ***
+
+# *** Compute perplexity for each response ***
+perplexities = [compute_perplexity(response) for response in df["response"]]
+avg_perplexity = np.mean(perplexities)
+# ***
+
+# *** Compute diversity (Distinct-1 and Distinct-2) over all responses ***
+dist1 = distinct_n(df["response"], 1)
+dist2 = distinct_n(df["response"], 2)
+# ***
+
+# *** Draw reward against epoch and save the figure as a PNG ***
+fig, ax = plt.subplots(figsize=(8, 4))
+ax.plot(df["epoch"], df["reward"], marker="o")
+ax.set_title("Reward Progression over Epochs")
+ax.set_xlabel("Epoch")
+ax.set_ylabel("Reward")
+ax.grid(True)
+fig.tight_layout()
+fig.savefig("metrics_results/reward_progression.png")
+# Close the figure once it has been written to disk.
+plt.close(fig)
+# ***
+
+# *** Save evaluation metrics summary to a text file ***
+with open("metrics_results/eval_metrics_summary.txt", "w") as f:
+    f.write("Evaluation Metrics Summary\n")
+    f.write("--------------------------\n")
+    f.write(f"Sentiment Accuracy: {sentiment_accuracy * 100:.2f}%\n")
+    f.write(f"Average Perplexity: {avg_perplexity:.2f}\n")
+    f.write(f"Distinct-1: {dist1:.4f}\n")
+    f.write(f"Distinct-2: {dist2:.4f}\n")
+
+    # Still inside the ``with`` block: when the training log has a
+    # "kl_divergence" column, export epoch/reward/KL as CSV and append a
+    # pointer to that file in the same summary.
+    if "kl_divergence" in df.columns:
+        df[["epoch", "reward", "kl_divergence"]].to_csv("metrics_results/kl_vs_reward.csv", index=False)
+        f.write("\nKL vs Reward data saved to metrics_results/kl_vs_reward.csv\n")
+# ***
+
+# Both artefacts are written under metrics_results/, so the completion
+# message reports those paths.
+print("Evaluation complete. Metrics saved to 'metrics_results/eval_metrics_summary.txt' and reward progression plotted to 'metrics_results/reward_progression.png'.")
+
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH --job-name=eval_metrics
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=4
+#SBATCH --time=00:10:00
+#SBATCH --mem=8G
+#SBATCH --output=metrics_results/eval_metrics.out
+#SBATCH --error=metrics_results/eval_metrics.err
+
+# Set up the software environment: cluster modules first, then the "grpo"
+# conda environment (`ml` is presumably the Lmod shorthand for `module load`).
+ml GCCcore/13.3.0
+ml Miniconda3/23.10.0-1
+source ~/.bashrc
+conda activate grpo
+
+# eval_metrics.py uses relative paths (ppo_logs/, metrics_results/), so stop
+# here if the run directory is missing instead of running from elsewhere.
+cd /scratch/user/mincheolseong/GRPO_project/ECEN743-GRPO-Project-Proposal/mincheol_runs || exit 1
+python eval_metrics.py
+
@@ -0,0 +1,4 @@
+/scratch/user/mincheolseong/.conda/envs/grpo/lib/python3.10/site-packages/huggingface_hub/file_download.py:896: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
+  warnings.warn(
+/scratch/user/mincheolseong/.conda/envs/grpo/lib/python3.10/site-packages/huggingface_hub/file_download.py:896: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
+  warnings.warn(
@@ -0,0 +1,2 @@
+>>> Using device: cuda
+Evaluation complete. Metrics saved to 'ppo_logs/eval_metrics_summary.txt' and reward progression plotted to 'ppo_logs/reward_progression.png'.
@@ -0,0 +1,8 @@
+Evaluation Metrics Summary
+--------------------------
+Sentiment Accuracy: 0.00%
+Average Perplexity: 2.98
+Distinct-1: 0.1807
+Distinct-2: 0.2489
+
+KL vs Reward data saved to metrics_results/kl_vs_reward.csv
@@ -0,0 +1,6 @@
+epoch,reward,kl_divergence
+1,0.7580376863479614,5.060361385345459
+2,0.9595156908035278,5.051671028137207
+3,0.995012104511261,3.950826168060303
+4,0.9967284202575684,-6.376477241516113
+5,0.8975580334663391,10.203742980957031
@@ -0,0 +1,39 @@
+{
+  "_name_or_path": "gpt2",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.31.0",
+  "use_cache": true,
+  "vocab_size": 50257
+}
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.31.0"
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+>>> Using device: cuda`
	`2`	`+Evaluation complete. Metrics saved to 'ppo_logs/eval_metrics_summary.txt' and reward progression plotted to 'ppo_logs/reward_progression.png'.`