 import math
 from datasets import Dataset, DatasetDict, load_dataset
 from transformers import AutoTokenizer
-from cuml.linear_model import LogisticRegression
-import cupy as cp
-

 def update_model_info(model_info):
     for model, info in model_info.items():
@@ -142,17 +139,17 @@ def split_gen():
             if "calibrated" in file:
                 if info["prompted"]:
                     if suffix.startswith("complete"):
-                        with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench*-{suffix}", "w") as f:
+                        with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
                             f.writelines(data)
                     else:
-                        with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench*-{suffix}", "w") as f:
+                        with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
                             f.writelines(data)
             else:
                 if suffix.startswith("complete"):
-                    with open(f"sanitized_samples/complete/{model}--bigcodebench*-{suffix}", "w") as f:
+                    with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
                         f.writelines(data)
                 else:
-                    with open(f"sanitized_samples/instruct/{model}--bigcodebench*-{suffix}", "w") as f:
+                    with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
                         f.writelines(data)


@@ -221,95 +218,6 @@ def read_task_perf(tids, task="complete"):
     return model_results, result_files


-def get_winner_df(data_dict, tids, task, task_level=True, no_tie=True):
-    winner_dict = {"task_id": [], "model_a": [], "model_b": [], "winner": []}
-    if not task_level:
-        file = f"{task}_winner_df.csv"
-    else:
-        file = f"{task}_winner_task_df.csv"
-
-    if task_level:
-        for task_id in tqdm(tids):
-            # pair without repetition (a, b) and (b, a) are the same
-            for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
-                solve_rate_a = data_dict[model_a][task_id]
-                solve_rate_b = data_dict[model_b][task_id]
-
-                if solve_rate_a > solve_rate_b:
-                    winner_dict["winner"].append("model_a")
-                elif solve_rate_a < solve_rate_b:
-                    winner_dict["winner"].append("model_b")
-                else:
-                    if no_tie:
-                        continue
-                    winner_dict["winner"].append("tie")
-
-                winner_dict["task_id"].append(task_id)
-                winner_dict["model_a"].append(model_a)
-                winner_dict["model_b"].append(model_b)
-    else:
-        data_dict = {model: np.mean(list(task_perf.values())) for model, task_perf in data_dict.items()}
-        for model_a, model_b in itertools.combinations(data_dict.keys(), 2):
-            solve_rate_a = data_dict[model_a]
-            solve_rate_b = data_dict[model_b]
-
-            if solve_rate_a > solve_rate_b:
-                winner_dict["winner"].append("model_a")
-            elif solve_rate_a < solve_rate_b:
-                winner_dict["winner"].append("model_b")
-            else:
-                if no_tie:
-                    continue
-                winner_dict["winner"].append("tie")
-            winner_dict["task_id"].append(task)
-            winner_dict["model_a"].append(model_a)
-            winner_dict["model_b"].append(model_b)
-
-    df = pd.DataFrame(winner_dict)
-    df.to_csv(file, index=False)
-    return df
-
-
-def get_bootstrap_result(battles, func_compute_elo, num_round):
-    rows = []
-    for i in tqdm(range(num_round), desc="bootstrap"):
-        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
-    df = pd.DataFrame(rows)
-    return df[df.median().sort_values(ascending=False).index]
-
-
-def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
-
-
-    models = pd.concat([df["model_a"], df["model_b"]]).unique()
-    models = pd.Series(np.arange(len(models)), index=models)
-    p = len(models.index)
-    n = df.shape[0]
-
-    X = cp.zeros([n, p])
-    X[cp.arange(n), models[df["model_a"]]] = +math.log(BASE)
-    X[cp.arange(n), models[df["model_b"]]] = -math.log(BASE)
-
-    Y = cp.zeros(n)
-    Y[df["winner"] == "model_a"] = 1.0
-
-    lr = LogisticRegression(fit_intercept=False)
-    lr.fit(X, Y)
-
-    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
-
-    return pd.Series(cp.asnumpy(elo_scores), index=models.index).sort_values(ascending=False)
-
-
-def update_elo_rating(results, elo_dict):
-    for model, info in model_info.items():
-        if info["name"] not in elo_dict:
-            results[info["name"]]["elo_mle"] = None
-        else:
-            results[info["name"]]["elo_mle"] = elo_dict[info["name"]]
-    return results
-
-
 def get_domain_perf(data_dict, task2domain):
     domain_perfs = {
         "Model": [],
@@ -347,7 +255,7 @@ def get_solve_rate(data_dict, task="complete"):

 def get_hf_ds(results):
     hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [],
-                  "complete": [], "instruct": [], "elo_mle": []}
+                  "complete": [], "instruct": []}

     for model, result in results.items():
         hf_dataset["model"].append(model)
@@ -360,7 +268,6 @@ def get_hf_ds(results):
         hf_dataset["complete"].append(result["pass@1"]["complete"])
         hf_dataset["instruct"].append(result["pass@1"]["instruct"])
         # hf_dataset["direct_complete"].append(result["direct_complete"])
-        hf_dataset["elo_mle"].append(result["elo_mle"])

     return Dataset.from_dict(hf_dataset)

@@ -395,7 +302,7 @@ def get_perf_df(data_dict):


 if __name__ == "__main__":
-    split_gen()
+    # split_gen()
     bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
     bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
     bcb_config = {
@@ -429,28 +336,7 @@ def get_perf_df(data_dict):
         instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
         solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
         push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate")
-
-        elo_config = {
-            "task_no_tie": (True, True),
-            "benchmark_tie": (False, False),
-        }
-        elo_ds = dict()
-        for config, (task_level, no_tie) in elo_config.items():
-            filter_complete_data = {model: task_perf for model, task_perf in complete_data.items() if model in instruct_data}
-            complete_battles = get_winner_df(filter_complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
-            instruct_battles = get_winner_df(instruct_data, bcb["task_id"], "instruct", task_level=task_level, no_tie=no_tie)
-            battles = pd.concat([complete_battles, instruct_battles])
-            elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
-            bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
-            bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
-            bootstrap_lu_median_dict = bootstrap_lu_median.set_index("model")["Elo rating"].to_dict()
-            if config == "task_no_tie":
-                task_elo = bootstrap_lu_median_dict
-            elo = get_bootstrap_scores(elo_mle_bootstrap)
-            elo_ds[config] = elo
-        push_ds(DatasetDict(elo_ds), f"bigcode/bigcodebench{suffix}-elo")

-        results = update_elo_rating(results, task_elo)
         with open(f"results{suffix}.json", "w") as f:
             json.dump(results, f, indent=4)
         ds = get_hf_ds(results)
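
Note: the Elo pipeline removed above fits a Bradley-Terry model over pairwise "battles" (rows of model_a, model_b, winner) with a logistic regression on GPU via cuML/CuPy. The sketch below is a minimal CPU restatement of that fit for reference only, assuming scikit-learn and NumPy as stand-ins for cuML/CuPy; the helper name elo_mle_cpu and the toy battle data are hypothetical and not part of this change.

# Minimal CPU sketch of the removed Elo-MLE fit (assumption: sklearn/NumPy
# replace the cuML/CuPy used by the original get_elo_mle).
import math

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def elo_mle_cpu(df, SCALE=400, BASE=10, INIT_RATING=1000):
    # df has one row per battle with columns "model_a", "model_b", "winner".
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)
    n, p = df.shape[0], len(models.index)

    # Bradley-Terry design matrix: +log(BASE) for model_a, -log(BASE) for model_b.
    X = np.zeros((n, p))
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # Label is 1 when model_a won, 0 otherwise (ties count against model_a here).
    Y = np.zeros(n)
    Y[(df["winner"] == "model_a").to_numpy()] = 1.0

    # Fit without intercept; the coefficients are the latent model strengths.
    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    # Rescale to the familiar Elo range.
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)


if __name__ == "__main__":
    # Tiny illustrative battle log (hypothetical data).
    battles = pd.DataFrame({
        "model_a": ["m1", "m1", "m2"],
        "model_b": ["m2", "m3", "m3"],
        "winner": ["model_a", "model_a", "model_b"],
    })
    print(elo_mle_cpu(battles))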