Skip to content

Commit 4aaf0e9

Browse files
committed
evalperf: o1-preview and claude 3.5 sonnet
1 parent fde5268 commit 4aaf0e9

File tree

5 files changed

+1835
-1287
lines changed

5 files changed

+1835
-1287
lines changed

evalperf.html

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ <h1 class="text-nowrap mt-5" style="font-size: xx-large;">
126126

127127
<div class="container d-flex flex-column align-items-center gap-3 mt-5">
128128
<h3>Win-rate Leaderboard</h3>
129+
<p>📊 Ranking metrics: WR (Win-Rate; %) based on task- and model-wise competition (i.e., pairwise DPS).</p>
129130
<p>📝 Notes: the default prompt does not emphasize efficiency requirements as our work shows such emphasis
130131
might degrade both efficiency and correctness for some weak models. Yet, "(⏩)" marks models using
131132
performance-encouraging prompts as they might be able to accurately understand such needs.</p>
@@ -275,15 +276,15 @@ <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
275276
}
276277
const globalData = data;
277278
const HeatmapTable = heatmapTable;
278-
const winrate_tag = "🏆 Win Rate (%)";
279+
const winrate_tag = "🏆 Model WR";
279280

280281
// each row represents a model
281282
const theaders = [
282283
"#", // rank
283284
"Model", // model name
284-
"DPS",
285-
// "DPS Norm",
286-
"pass@1",
285+
// "DPS",
286+
// "pass@1",
287+
"Task WR", // task winrate
287288
winrate_tag, // computed over the same set of passing solutions
288289
];
289290

@@ -310,7 +311,7 @@ <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
310311
data = data.map(
311312
([modelId, value]) => new Map([["modelId", modelId], ...value]),
312313
)
313-
data.sort((a, b) => b.get("win_rate") - a.get("win_rate"));
314+
data.sort((a, b) => b.get("model_win_rate") - a.get("model_win_rate"));
314315

315316
var tbody = document.createElement("tbody");
316317
// add rank
@@ -329,10 +330,10 @@ <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
329330
if (modelId.includes("--")) {
330331
modelName = modelId.split("--")[1];
331332
}
332-
var cur_win_rate = row.get('win_rate').toFixed(3);
333-
if (last_best != cur_win_rate) {
333+
var cur_model_wr = row.get('model_win_rate').toFixed(3);
334+
if (last_best != cur_model_wr) {
334335
rank += n_last_best;
335-
last_best = cur_win_rate;
336+
last_best = cur_model_wr;
336337
rankCell.textContent = rank;
337338
n_last_best = 1;
338339
} else {
@@ -354,19 +355,23 @@ <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
354355
modelLink.classList.add("text-nowrap");
355356
modelCell.appendChild(modelLink);
356357
dataRow.appendChild(modelCell);
357-
dpsRow = document.createElement("td");
358-
dpsRow.textContent = row.get("dps").toFixed(1);
359-
dataRow.appendChild(dpsRow);
360-
// dpsNormRow = document.createElement("td");
361-
// dpsNormRow.textContent = row.get("dps_norm").toFixed(1);
362-
// dataRow.appendChild(dpsNormRow);
363-
passRow = document.createElement("td");
364-
passRow.textContent = row.get("pass@1").toFixed(1);
365-
dataRow.appendChild(passRow);
366-
winRateRow = document.createElement("td");
367-
winRateRow.textContent = (row.get('win_rate') * 100).toFixed(1);
368-
winRateRow.style.backgroundColor = "#EEFFEE";
369-
dataRow.appendChild(winRateRow);
358+
359+
// dpsRow = document.createElement("td");
360+
// dpsRow.textContent = row.get("dps").toFixed(1);
361+
// dataRow.appendChild(dpsRow);
362+
// passRow = document.createElement("td");
363+
// passRow.textContent = row.get("pass@1").toFixed(1);
364+
// dataRow.appendChild(passRow);
365+
366+
taskWinRateRow = document.createElement("td");
367+
taskWinRateRow.textContent = (row.get('task_win_rate') * 100).toFixed(1);
368+
dataRow.appendChild(taskWinRateRow);
369+
370+
371+
modelWinRateRow = document.createElement("td");
372+
modelWinRateRow.textContent = (row.get('model_win_rate') * 100).toFixed(1);
373+
modelWinRateRow.style.backgroundColor = "#EEFFEE";
374+
dataRow.appendChild(modelWinRateRow);
370375
tbody.appendChild(dataRow);
371376
});
372377
table.appendChild(tbody);
@@ -393,13 +398,13 @@ <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
393398
const modelData = Array.from(globalData).map(([modelId, value]) => ({
394399
id: modelId,
395400
name: modelId.includes('--') ? modelId.split('--')[1] : modelId,
396-
winrate: parseFloat(value.get('win_rate')),
401+
winrate: parseFloat(value.get('model_win_rate')),
397402
}));
398403

399404
// sort by general winrate
400405
modelData.sort((a, b) => b.winrate - a.winrate);
401406

402-
const defaultDisplayNum = 6;
407+
const defaultDisplayNum = 7;
403408

404409
let selectedXModels = modelData.slice(0, defaultDisplayNum).map(m => m.id);
405410
let selectedYModels = modelData.slice(0, defaultDisplayNum).map(m => m.id);

0 commit comments

Comments
 (0)