|
50 | 50 | " for item in data_list:\n",
|
51 | 51 | " assert new_range[0] <= item[lb] <= new_range[1]\n",
|
52 | 52 | " item[lb] = (item[lb] - new_range[0]) / max_range * 100\n",
|
| 53 | + " return data_list, range_map\n", |
| 54 | + "\n", |
| 55 | + "# solve the problem that some benchmark score is too high and out of range\n", |
| 56 | + "def log_normalize(raw_data, labels):\n", |
| 57 | + " data_list = cp.deepcopy(raw_data)\n", |
| 58 | + " minimum, maximum, max_range, range_map = {}, {}, 0, {}\n", |
| 59 | + " for lb in labels:\n", |
| 60 | + " minimum[lb] = min([np.log(x[lb]) for x in data_list])\n", |
| 61 | + " maximum[lb] = max([np.log(x[lb]) for x in data_list])\n", |
| 62 | + " max_range = max(max_range, maximum[lb] - minimum[lb])\n", |
| 63 | + " max_range *= 1.005\n", |
| 64 | + " for lb in labels:\n", |
| 65 | + " mid = (minimum[lb] + maximum[lb]) / 2\n", |
| 66 | + " new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 100)\n", |
| 67 | + " range_map[lb] = new_range\n", |
| 68 | + " for item in data_list:\n", |
| 69 | + " assert new_range[0] <= np.log(item[lb]) <= new_range[1]\n", |
| 70 | + " item[lb] = (np.log(item[lb]) - new_range[0]) / max_range * 100\n", |
53 | 71 | " return data_list, range_map"
|
54 | 72 | ]
|
55 | 73 | },
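The core of the new `log_normalize` is a log transform followed by rescaling onto a shared 0-100 axis; the `1.005` factor adds 0.5% of padding so the top score maps just below 100 rather than exactly onto the boundary. Below is a stripped-down, single-axis sketch of that transform (toy scores invented for illustration; the per-label centering via `new_range` is omitted, and `cp` is assumed to be `copy` imported as `cp` in an earlier cell):

```python
import numpy as np

# illustrative scores for one benchmark axis; the outlier is the kind of
# value that overflowed the old linear normalization
scores = np.array([12.0, 55.0, 890.0])

lo, hi = np.log(scores).min(), np.log(scores).max()
max_range = (hi - lo) * 1.005   # same 0.5% padding as log_normalize
rescaled = (np.log(scores) - lo) / max_range * 100

print(rescaled)  # all three values now land inside [0, 100)
```

In the full function, `max_range` is the widest log-range over all labels, so every axis shares one scale and the plotted shapes stay comparable across benchmarks; `range_map` records each label's window, presumably for labeling the axes later.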
|
|
64 | 82 | "models = list(data)\n",
|
65 | 83 | "print(models)\n",
|
66 | 84 | "\n",
|
| 85 | + "# model2vis = [\n", |
| 86 | + "# 'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus', \n", |
| 87 | + "# 'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", |
| 88 | + "# 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n", |
| 89 | + "# ]\n", |
| 90 | + "\n", |
67 | 91 | "model2vis = [\n",
|
68 | | - " 'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus', \n", |
69 | | - " 'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", |
| 92 | + " # 'GPT-4v (detail: low)', 'GeminiProVision', 'InternLM-XComposer2-VL', \n", |
| 93 | + " 'GPT-4v (1106, detail-low)', 'Gemini-1.0-Pro', 'Gemini-1.5-Pro', #'Gemini-1.5-Flash', 'Qwen-VL-Plus', \n", |
| 94 | + " 'InternLM-XComposer2', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", |
70 | 95 | " 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n",
|
71 | 96 | "]\n",
|
| 97 | + "\n", |
72 | 98 | "colors = [\n",
|
73 | 99 | " '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', \n",
|
74 | 100 | " '#e377c2', '#7f7f7f', '#bcbd22'\n",
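The nine hex values are the first nine colors of matplotlib's default `tab10` palette, paired positionally with the nine entries of `model2vis`. A hypothetical sketch of that pairing (the plotting cells sit outside this hunk, so the actual lookup may differ; this assumes the two lists above are in scope):

```python
# hypothetical: pin one fixed color per model so it stays
# consistent across every chart in the notebook
model_colors = dict(zip(model2vis, colors))
print(model_colors['LLaVA-v1.5-13B'])  # '#9467bd' (5th entry of both lists)
```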
|
|
81 | 107 | "metadata": {},
|
82 | 108 | "outputs": [],
|
83 | 109 | "source": [
|
| 110 | + "from collections import defaultdict\n", |
| 111 | + "\n", |
84 | 112 | "split = 'MMBench_TEST_EN'\n",
|
85 | | - "data_sub = {k: v[split] for k, v in data.items()}\n", |
| 113 | + "# data_sub = {k: v[split] for k, v in data.items()}\n", |
| 114 | + "data_sub = {k: defaultdict(int, v)[split] for k, v in data.items()}\n", |
| 115 | + "# solve the problem that some model lack the evaluation of MMBench_TEST_EN\n", |
86 | 116 | "\n",
|
87 | 117 | "labels = list(data_sub[model2vis[0]])\n",
|
88 | 118 | "labels.remove('Overall')\n",
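The `defaultdict(int, v)` wrapper is what makes the missing-split fix work: looking up a split the model was never evaluated on yields `0` instead of raising `KeyError`. A self-contained illustration:

```python
from collections import defaultdict

v = {'MMBench_DEV_EN': 75.1}   # toy entry: this model has no TEST_EN score
split = 'MMBench_TEST_EN'

# plain v[split] would raise KeyError; the defaultdict copy
# falls back to int(), i.e. 0, for the missing split
score = defaultdict(int, v)[split]
print(score)  # 0
```

The trade-off is that a missing evaluation becomes indistinguishable from a genuine score of 0; note also that a 0 would break `log_normalize` (`np.log(0)` is `-inf`), so this fallback presumably pairs with the linear normalization path.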
|
|