
Commit 832035e

update eval model

1 parent 1e12249 commit 832035e

File tree

2 files changed: +333 −193 lines


analysis/get_results.py

Lines changed: 6 additions & 120 deletions
@@ -11,9 +11,6 @@
 import math
 from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
-from cuml.linear_model import LogisticRegression
-import cupy as cp
-
 
 def update_model_info(model_info):
     for model, info in model_info.items():
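
Note: the two deleted imports served only the GPU-backed Elo fit removed later in this commit. Should that computation ever be revived without a GPU, a plausible CPU substitution (an assumption, not something this commit does) would be:

    # Assumed CPU stand-ins for the deleted GPU-only imports:
    from sklearn.linear_model import LogisticRegression  # replaces cuml.linear_model
    import numpy as np                                   # replaces cupy as cp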
@@ -142,17 +139,17 @@ def split_gen():
         if "calibrated" in file:
             if info["prompted"]:
                 if suffix.startswith("complete"):
-                    with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench*-{suffix}", "w") as f:
+                    with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
                         f.writelines(data)
                 else:
-                    with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench*-{suffix}", "w") as f:
+                    with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
                         f.writelines(data)
         else:
             if suffix.startswith("complete"):
-                with open(f"sanitized_samples/complete/{model}--bigcodebench*-{suffix}", "w") as f:
+                with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
                     f.writelines(data)
             else:
-                with open(f"sanitized_samples/instruct/{model}--bigcodebench*-{suffix}", "w") as f:
+                with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
                     f.writelines(data)
 
 
@@ -221,95 +218,6 @@ def read_task_perf(tids, task="complete"):
     return model_results, result_files
 
 
-def get_winner_df(data_dict, tids, task, task_level=True, no_tie=True):
-    winner_dict = {"task_id": [], "model_a": [], "model_b": [], "winner": []}
-    if not task_level:
-        file = f"{task}_winner_df.csv"
-    else:
-        file = f"{task}_winner_task_df.csv"
-
-    if task_level:
-        for task_id in tqdm(tids):
-            # pair without repetition (a, b) and (b, a) are the same
-            for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
-                solve_rate_a = data_dict[model_a][task_id]
-                solve_rate_b = data_dict[model_b][task_id]
-
-                if solve_rate_a > solve_rate_b:
-                    winner_dict["winner"].append("model_a")
-                elif solve_rate_a < solve_rate_b:
-                    winner_dict["winner"].append("model_b")
-                else:
-                    if no_tie:
-                        continue
-                    winner_dict["winner"].append("tie")
-
-                winner_dict["task_id"].append(task_id)
-                winner_dict["model_a"].append(model_a)
-                winner_dict["model_b"].append(model_b)
-    else:
-        data_dict = {model: np.mean(list(task_perf.values())) for model, task_perf in data_dict.items()}
-        for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
-            solve_rate_a = data_dict[model_a]
-            solve_rate_b = data_dict[model_b]
-
-            if solve_rate_a > solve_rate_b:
-                winner_dict["winner"].append("model_a")
-            elif solve_rate_a < solve_rate_b:
-                winner_dict["winner"].append("model_b")
-            else:
-                if no_tie:
-                    continue
-                winner_dict["winner"].append("tie")
-            winner_dict["task_id"].append(task)
-            winner_dict["model_a"].append(model_a)
-            winner_dict["model_b"].append(model_b)
-
-    df = pd.DataFrame(winner_dict)
-    df.to_csv(file, index=False)
-    return df
-
-
-def get_bootstrap_result(battles, func_compute_elo, num_round):
-    rows = []
-    for i in tqdm(range(num_round), desc="bootstrap"):
-        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
-    df = pd.DataFrame(rows)
-    return df[df.median().sort_values(ascending=False).index]
-
-
-def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
-
-
-    models = pd.concat([df["model_a"], df["model_b"]]).unique()
-    models = pd.Series(np.arange(len(models)), index=models)
-    p = len(models.index)
-    n = df.shape[0]
-
-    X = cp.zeros([n, p])
-    X[cp.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[cp.arange(n), models[df["model_b"]]] = -math.log(BASE)
-
-    Y = cp.zeros(n)
-    Y[df["winner"] == "model_a"] = 1.0
-
-    lr = LogisticRegression(fit_intercept=False)
-    lr.fit(X, Y)
-
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-
-    return pd.Series(cp.asnumpy(elo_scores), index=models.index).sort_values(ascending=False)
-
-
-def update_elo_rating(results, elo_dict):
-    for model, info in model_info.items():
-        if info["name"] not in elo_dict:
-            results[info["name"]]["elo_mle"] = None
-        else:
-            results[info["name"]]["elo_mle"] = elo_dict[info["name"]]
-    return results
-
-
 def get_domain_perf(data_dict, task2domain):
     domain_perfs = {
         "Model": [],
@@ -347,7 +255,7 @@ def get_solve_rate(data_dict, task="complete"):
 
 def get_hf_ds(results):
     hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [],
-                  "complete": [], "instruct": [], "elo_mle": []}
+                  "complete": [], "instruct": []}
 
     for model, result in results.items():
         hf_dataset["model"].append(model)
@@ -360,7 +268,6 @@ def get_hf_ds(results):
         hf_dataset["complete"].append(result["pass@1"]["complete"])
         hf_dataset["instruct"].append(result["pass@1"]["instruct"])
         # hf_dataset["direct_complete"].append(result["direct_complete"])
-        hf_dataset["elo_mle"].append(result["elo_mle"])
 
     return Dataset.from_dict(hf_dataset)
 
@@ -395,7 +302,7 @@ def get_perf_df(data_dict):
 
 
 if __name__ == "__main__":
-    split_gen()
+    # split_gen()
     bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
     bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
     bcb_config = {
@@ -429,28 +336,7 @@ def get_perf_df(data_dict):
     instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
     solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
     push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate")
-
-    elo_config = {
-        "task_no_tie": (True, True),
-        "benchmark_tie": (False, False),
-    }
-    elo_ds = dict()
-    for config, (task_level, no_tie) in elo_config.items():
-        filter_complete_data = {model: task_perf for model, task_perf in complete_data.items() if model in instruct_data}
-        complete_battles = get_winner_df(filter_complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
-        instruct_battles = get_winner_df(instruct_data, bcb["task_id"], "instruct", task_level=task_level, no_tie=no_tie)
-        battles = pd.concat([complete_battles, instruct_battles])
-        elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-        bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-        bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-        bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-        if config == "task_no_tie":
-            task_elo = bootstrap_lu_median_dict
-        elo = get_bootstrap_scores(elo_mle_bootstrap)
-        elo_ds[config] = elo
-    push_ds(DatasetDict(elo_ds), f"bigcode/bigcodebench{suffix}-elo")
 
-    results = update_elo_rating(results, task_elo)
     with open(f"results{suffix}.json", "w") as f:
         json.dump(results, f, indent=4)
     ds = get_hf_ds(results)
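
Note: the deleted driver block above resampled the battle table 500 times, refit the Elo model each round, and kept the per-model median rounded to an integer. Continuing the elo_mle_cpu / battles sketch from earlier, that bootstrap step would look roughly like:

    # Bootstrap: resample battles with replacement and refit each round.
    rows = []
    for _ in range(500):
        sample = battles.sample(frac=1.0, replace=True)
        if sample["winner"].nunique() < 2:  # logistic regression needs both outcomes
            continue
        rows.append(elo_mle_cpu(sample))

    # Median rating per model across rounds, rounded as the old code did.
    ratings = (pd.DataFrame(rows).median().sort_values(ascending=False) + 0.5).astype(int)
    print(ratings)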
