diff --git a/coling2025/.gitignore b/coling2025/.gitignore new file mode 100644 index 0000000..f18900f --- /dev/null +++ b/coling2025/.gitignore @@ -0,0 +1,3 @@ +clean_battle_20240814_public.json +llmfao.csv +scale/*.parquet diff --git a/coling2025/README.md b/coling2025/README.md new file mode 100644 index 0000000..21a892d --- /dev/null +++ b/coling2025/README.md @@ -0,0 +1,28 @@ +# Code to Reproduce Experiments from the COLING 2025 Paper + +- Ustalov, D. [Reliable, Reproducible, and Really Fast Leaderboards with Evalica](https://arxiv.org/abs/2412.11314). 2024. arXiv: [2412.11314 [cs.CL]](https://arxiv.org/abs/2412.11314). + +## Prerequisites + +- [`requirements.txt`](requirements.txt) +- Chatbot Arena's Dump (August 2024): +- LLMFAO Dataset: + +## Table 1: [chatbot_arena.csv](chatbot_arena.csv) + +```shell +python3 -m chatbot_arena +``` + +## Table 2: [rust_python.csv](rust_python.csv) + +```shell +python3 -m rust_python +``` + +## Table 3: [scale.csv](scale.csv) + +```shell +python3 -m scale_data +python3 -m scale_compute +``` diff --git a/coling2025/chatbot_arena.csv b/coling2025/chatbot_arena.csv new file mode 100644 index 0000000..8face08 --- /dev/null +++ b/coling2025/chatbot_arena.csv @@ -0,0 +1,41 @@ +algorithm,solver,time +elo,arena,4.505625984999824 +elo,arena,4.30741854200005 +elo,arena,3.8287284730004103 +elo,arena,3.2124255979997542 +elo,arena,3.1871768069995596 +elo,arena,4.54556304200014 +elo,arena,3.89093991000027 +elo,arena,3.2158020300003045 +elo,arena,3.2279247070000565 +elo,arena,3.690373288999581 +bradley_terry,arena,53.84085044400035 +bradley_terry,arena,49.05527460100075 +bradley_terry,arena,49.824193399999785 +bradley_terry,arena,49.06932971599963 +bradley_terry,arena,48.84145686500051 +bradley_terry,arena,48.852593298999636 +bradley_terry,arena,51.96913476999998 +bradley_terry,arena,53.00518341099996 +bradley_terry,arena,55.14430098199955 +bradley_terry,arena,57.280526522999935 +elo,evalica,1.2934383190004155 
+elo,evalica,1.2451738849995309 +elo,evalica,1.263170829000046 +elo,evalica,1.3015334930005338 +elo,evalica,1.2956993719999446 +elo,evalica,1.2331900440003665 +elo,evalica,1.2465266949993747 +elo,evalica,1.240900351000164 +elo,evalica,1.2116083800001434 +elo,evalica,1.218696920000184 +bradley_terry,evalica,1.1849060429995006 +bradley_terry,evalica,1.164167107999674 +bradley_terry,evalica,1.1925056350000887 +bradley_terry,evalica,1.1563715420006702 +bradley_terry,evalica,1.196678212999359 +bradley_terry,evalica,1.167977401999451 +bradley_terry,evalica,1.1835675629999969 +bradley_terry,evalica,1.1618928819998473 +bradley_terry,evalica,1.1576560439998502 +bradley_terry,evalica,1.1638413099999525 diff --git a/coling2025/chatbot_arena.py b/coling2025/chatbot_arena.py new file mode 100755 index 0000000..0f3b0a1 --- /dev/null +++ b/coling2025/chatbot_arena.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +# ruff: noqa: E501, EM101, F401, N803 + +from __future__ import annotations + +import math +from collections import defaultdict # noqa: TC003 +from functools import partial +from timeit import repeat + +import evalica +import numpy as np +import pandas as pd +from sklearn.linear_model import LogisticRegression +from tqdm.auto import tqdm + +REPETITIONS = 10 + + +def chatbot_arena_elo( + battles: pd.DataFrame, + K: float = 4, + SCALE: float = 400, + BASE: float = 10, + INIT_RATING: float = 1000, +) -> defaultdict[str, float]: + raise NotImplementedError( + "Please copy the code from the official Chatbot Arena notebook and paste it here: " + "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH " + "(compute_online_elo function)", + ) + + +def arena_hard_bradley_terry( + df: pd.DataFrame, + SCALE: float = 400, + BASE: float = 10, + INIT_RATING: float = 1000, +) -> pd.Series[str]: + raise NotImplementedError( + "Please copy the code from the official Arena-Hard repository and paste it here: " + 
"https://github.com/lmarena/arena-hard-auto/blob/2971e34d066f986c09bc5a463fa286fa93bcca3c/utils_math.py#L38-L69", + ) + + +def main() -> None: + df_arena = pd.read_json("clean_battle_20240814_public.json") + df_arena = df_arena[df_arena["anony"]] + df_arena = df_arena[df_arena["dedup_tag"].apply(lambda x: x.get("sampled", False))] + df_arena["evalica"] = df_arena["winner"].map({ + "model_a": evalica.Winner.X, + "model_b": evalica.Winner.Y, + "tie": evalica.Winner.Draw, + "tie (bothbad)": evalica.Winner.Draw, + }) + df_arena = df_arena[~df_arena["evalica"].isna()] + + results = [] + + with tqdm(total=4) as pbar: + arena_elo_time = repeat( + partial(chatbot_arena_elo, df_arena), + repeat=REPETITIONS, number=1, + ) + results.append(("elo", "arena", arena_elo_time)) + pbar.update() + + hard_arena_bt_time = repeat( + partial(arena_hard_bradley_terry, df_arena), + repeat=REPETITIONS, number=1, + ) + results.append(("bradley_terry", "arena", hard_arena_bt_time)) + pbar.update() + + evalica_elo_time = repeat( + partial(evalica.elo, df_arena["model_a"], df_arena["model_b"], df_arena["evalica"]), + repeat=REPETITIONS, number=1, + ) + results.append(("elo", "evalica", evalica_elo_time)) + pbar.update() + + evalica_bt_time = repeat( + partial(evalica.bradley_terry, df_arena["model_a"], df_arena["model_b"], df_arena["evalica"]), + repeat=REPETITIONS, number=1, + ) + results.append(("bradley_terry", "evalica", evalica_bt_time)) + pbar.update() + + df_results = pd.DataFrame(results, columns=["algorithm", "solver", "time"]) + df_results = df_results.explode("time") + df_results = df_results.reset_index(drop=True) + df_results.to_csv("chatbot_arena.csv", index=False) + + +if __name__ == "__main__": + main() diff --git a/coling2025/requirements.txt b/coling2025/requirements.txt new file mode 100644 index 0000000..1b3bb79 --- /dev/null +++ b/coling2025/requirements.txt @@ -0,0 +1,5 @@ +evalica==0.3.2 +pandas==2.2.3 +pyarrow==18.1.0 +scikit-learn==1.6.0 +tqdm==4.67.1 diff --git 
a/coling2025/rust_python.csv b/coling2025/rust_python.csv new file mode 100644 index 0000000..bc05ee7 --- /dev/null +++ b/coling2025/rust_python.csv @@ -0,0 +1,141 @@ +algorithm,solver,time +counting,pyo3,0.006875361001220881 +counting,pyo3,0.005204043000048841 +counting,pyo3,0.005150118999154074 +counting,pyo3,0.004930110999339377 +counting,pyo3,0.005055014999015839 +counting,pyo3,0.004834370000025956 +counting,pyo3,0.0048147329998755595 +counting,pyo3,0.005039478999606217 +counting,pyo3,0.0049270449999312405 +counting,pyo3,0.005079647999082226 +counting,naive,0.009094708000702667 +counting,naive,0.009216813999955775 +counting,naive,0.009107648000281188 +counting,naive,0.009295828000176698 +counting,naive,0.009225302001141245 +counting,naive,0.009263153999199858 +counting,naive,0.009250584000255913 +counting,naive,0.009232936999978847 +counting,naive,0.00893497099968954 +counting,naive,0.008893333000742132 +average_win_rate,pyo3,0.005162987999938196 +average_win_rate,pyo3,0.005016049999539973 +average_win_rate,pyo3,0.0049611690010351595 +average_win_rate,pyo3,0.004952190000039991 +average_win_rate,pyo3,0.004995280000002822 +average_win_rate,pyo3,0.004976274000000558 +average_win_rate,pyo3,0.00487582499954442 +average_win_rate,pyo3,0.004842217000259552 +average_win_rate,pyo3,0.004915758001516224 +average_win_rate,pyo3,0.004940922999594477 +average_win_rate,naive,0.0056375940002908465 +average_win_rate,naive,0.0056304649988305755 +average_win_rate,naive,0.0067451510003593285 +average_win_rate,naive,0.005464813999424223 +average_win_rate,naive,0.0059818110003106995 +average_win_rate,naive,0.005634520999592496 +average_win_rate,naive,0.0056934169988380745 +average_win_rate,naive,0.006093824000345194 +average_win_rate,naive,0.005781127998488955 +average_win_rate,naive,0.0062054570007603616 +bradley_terry,pyo3,0.0053178769994701724 +bradley_terry,pyo3,0.005525047999981325 +bradley_terry,pyo3,0.005011375000322005 +bradley_terry,pyo3,0.005122900998685509 
+bradley_terry,pyo3,0.005099248999613337 +bradley_terry,pyo3,0.0050138889982918045 +bradley_terry,pyo3,0.005214843999056029 +bradley_terry,pyo3,0.005149094000444165 +bradley_terry,pyo3,0.005218072999923606 +bradley_terry,pyo3,0.005254742998658912 +bradley_terry,naive,0.012066170998878079 +bradley_terry,naive,0.011944162999498076 +bradley_terry,naive,0.011667112999930396 +bradley_terry,naive,0.011669860999973025 +bradley_terry,naive,0.011628184000073816 +bradley_terry,naive,0.011669400000755559 +bradley_terry,naive,0.01161658199998783 +bradley_terry,naive,0.011653039000520948 +bradley_terry,naive,0.011644427000646829 +bradley_terry,naive,0.011589874000492273 +elo,pyo3,0.005369069000153104 +elo,pyo3,0.00532382100027462 +elo,pyo3,0.005319439000231796 +elo,pyo3,0.005307326000547619 +elo,pyo3,0.005343168000763399 +elo,pyo3,0.005356769001082284 +elo,pyo3,0.005366054001569864 +elo,pyo3,0.005641824000122142 +elo,pyo3,0.005391536000388442 +elo,pyo3,0.005369290000089677 +elo,naive,0.49616283500108693 +elo,naive,0.4852133749991481 +elo,naive,0.47851063500093005 +elo,naive,0.48006601499946555 +elo,naive,0.4753923959997337 +elo,naive,0.4769150800002535 +elo,naive,0.4766232599995419 +elo,naive,0.47964533800040954 +elo,naive,0.49262491800072894 +elo,naive,0.48891441200066765 +eigen,pyo3,0.005105121999804396 +eigen,pyo3,0.004977573998985463 +eigen,pyo3,0.005370251999920583 +eigen,pyo3,0.005091636001452571 +eigen,pyo3,0.004964488000041456 +eigen,pyo3,0.005006197001421242 +eigen,pyo3,0.005002247999073006 +eigen,pyo3,0.004940893999446416 +eigen,pyo3,0.004896967999229673 +eigen,pyo3,0.004950393000399345 +eigen,naive,0.007578472999739461 +eigen,naive,0.0068903650007996475 +eigen,naive,0.006166182000015397 +eigen,naive,0.005998622998959036 +eigen,naive,0.006027541001458303 +eigen,naive,0.006044929999916349 +eigen,naive,0.006003292999594123 +eigen,naive,0.006016929000907112 +eigen,naive,0.006057766000594711 +eigen,naive,0.005994141001792741 +pagerank,pyo3,0.005109638999783783 
+pagerank,pyo3,0.004911364998406498 +pagerank,pyo3,0.005008294001527247 +pagerank,pyo3,0.004950368998834165 +pagerank,pyo3,0.005036065000240342 +pagerank,pyo3,0.004928320999169955 +pagerank,pyo3,0.004861629000515677 +pagerank,pyo3,0.004890345000603702 +pagerank,pyo3,0.004856256000493886 +pagerank,pyo3,0.004860412998823449 +pagerank,naive,0.005966113998510991 +pagerank,naive,0.005886898999960977 +pagerank,naive,0.006147760001113056 +pagerank,naive,0.005819226000312483 +pagerank,naive,0.0057333940003445605 +pagerank,naive,0.005826475999128888 +pagerank,naive,0.006016974999511149 +pagerank,naive,0.006921724998392165 +pagerank,naive,0.006082464000428445 +pagerank,naive,0.006042460001481231 +newman,pyo3,0.0063594679995730985 +newman,pyo3,0.00596360400049889 +newman,pyo3,0.005977647999316105 +newman,pyo3,0.0058701870002551 +newman,pyo3,0.00590245500097808 +newman,pyo3,0.006189169000208494 +newman,pyo3,0.005855299999893759 +newman,pyo3,0.0060658649999822956 +newman,pyo3,0.006033386998751666 +newman,pyo3,0.006011262999891187 +newman,naive,0.009793019000426284 +newman,naive,0.009593479999239207 +newman,naive,0.009580083999026101 +newman,naive,0.009858966999672703 +newman,naive,0.009588980999978958 +newman,naive,0.009542887999486993 +newman,naive,0.009545767001327476 +newman,naive,0.00950388599994767 +newman,naive,0.009521482999844011 +newman,naive,0.009310036999522708 diff --git a/coling2025/rust_python.py b/coling2025/rust_python.py new file mode 100755 index 0000000..a7cc999 --- /dev/null +++ b/coling2025/rust_python.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +from functools import partial +from timeit import repeat + +import evalica +import pandas as pd +from tqdm.auto import tqdm + +ALGORITHMS = [ + evalica.counting, + evalica.average_win_rate, + evalica.bradley_terry, + evalica.elo, + evalica.eigen, + evalica.pagerank, + evalica.newman, +] + +REPETITIONS = 10 + +def main() -> None: + df_llmfao = pd.read_csv("llmfao.csv", dtype=str) + df_llmfao = df_llmfao[["left", 
"right", "winner"]] + df_llmfao["winner"] = df_llmfao["winner"].map({ + "left": evalica.Winner.X, + "right": evalica.Winner.Y, + "tie": evalica.Winner.Draw, + }) + + _, _, index = evalica.indexing(df_llmfao["left"], df_llmfao["right"]) + + results = [] + + for algorithm in tqdm(ALGORITHMS): + for solver in ("pyo3", "naive"): + stmt = partial( + algorithm, + xs=df_llmfao["left"], + ys=df_llmfao["right"], + winners=df_llmfao["winner"], + index=index, + solver=solver, + ) + + time = repeat(stmt, repeat=REPETITIONS, number=1) + + results.append((algorithm.__name__, solver, time)) + + df_results = pd.DataFrame(results, columns=["algorithm", "solver", "time"]) + df_results = df_results.explode("time") + df_results = df_results.reset_index(drop=True) + df_results.to_csv("rust_python.csv", index=False) + + +if __name__ == "__main__": + main() diff --git a/coling2025/scale.csv b/coling2025/scale.csv new file mode 100644 index 0000000..6220f8a --- /dev/null +++ b/coling2025/scale.csv @@ -0,0 +1,491 @@ +algorithm,scale,i,rows,models,time +counting,0,0,10,18,0.0008130349997372832 +average_win_rate,0,0,10,18,0.0002555070004746085 +bradley_terry,0,0,10,18,0.0002450259999022819 +elo,0,0,10,18,0.000214118999792845 +eigen,0,0,10,18,0.00022628300030191895 +pagerank,0,0,10,18,0.00025168100000882987 +newman,0,0,10,18,0.0007827359986549709 +counting,0,1,10,17,0.00023109900030249264 +average_win_rate,0,1,10,17,0.00021575400023721159 +bradley_terry,0,1,10,17,0.00020374300038383808 +elo,0,1,10,17,0.00021200699848122895 +eigen,0,1,10,17,0.0002493920001143124 +pagerank,0,1,10,17,0.00023909000083222054 +newman,0,1,10,17,0.000967486999797984 +counting,0,2,10,17,0.00022656200053461362 +average_win_rate,0,2,10,17,0.00021195699991949368 +bradley_terry,0,2,10,17,0.00019303399858472403 +elo,0,2,10,17,0.00020422399938979652 +eigen,0,2,10,17,0.00020838299860770348 +pagerank,0,2,10,17,0.00021896100042795297 +newman,0,2,10,17,0.0007549389993073419 +counting,0,3,10,19,0.0002271369994559791 
+average_win_rate,0,3,10,19,0.0002127110001310939 +bradley_terry,0,3,10,19,0.00019752000116568524 +elo,0,3,10,19,0.00022941399947740138 +eigen,0,3,10,19,0.00023074400087352842 +pagerank,0,3,10,19,0.0002257979995192727 +newman,0,3,10,19,0.0008825109998724656 +counting,0,4,10,16,0.00022896999871591106 +average_win_rate,0,4,10,16,0.00021687200023734476 +bradley_terry,0,4,10,16,0.00020707899966510013 +elo,0,4,10,16,0.00020937899898854084 +eigen,0,4,10,16,0.00023925499954202678 +pagerank,0,4,10,16,0.000219912000829936 +newman,0,4,10,16,0.0006820529997639824 +counting,0,5,10,16,0.0002249509998364374 +average_win_rate,0,5,10,16,0.0002299349998793332 +bradley_terry,0,5,10,16,0.00019448100101726595 +elo,0,5,10,16,0.0002018610011873534 +eigen,0,5,10,16,0.00023971399969013873 +pagerank,0,5,10,16,0.00022934399930818472 +newman,0,5,10,16,0.0007149189987103455 +counting,0,6,10,17,0.00021645900051225908 +average_win_rate,0,6,10,17,0.000208227000257466 +bradley_terry,0,6,10,17,0.00018959400040330365 +elo,0,6,10,17,0.00019832999896607362 +eigen,0,6,10,17,0.00023865100047260057 +pagerank,0,6,10,17,0.00022340000032272656 +newman,0,6,10,17,0.0007934269997349475 +counting,0,7,10,15,0.00023318800049310084 +average_win_rate,0,7,10,15,0.00021146200015209615 +bradley_terry,0,7,10,15,0.00019569799951568712 +elo,0,7,10,15,0.0002047039997705724 +eigen,0,7,10,15,0.00024003099861147348 +pagerank,0,7,10,15,0.00023652100026083644 +newman,0,7,10,15,0.0006783070002711611 +counting,0,8,10,17,0.00022049799918022472 +average_win_rate,0,8,10,17,0.00021058099991932977 +bradley_terry,0,8,10,17,0.0001934019983309554 +elo,0,8,10,17,0.00019864599926222581 +eigen,0,8,10,17,0.00024108599973260425 +pagerank,0,8,10,17,0.00022681999871565495 +newman,0,8,10,17,0.0007735649996902794 +counting,0,9,10,17,0.00022163300127431285 +average_win_rate,0,9,10,17,0.00021161599943297915 +bradley_terry,0,9,10,17,0.0001924680000229273 +elo,0,9,10,17,0.00019921400053135585 +eigen,0,9,10,17,0.00024285899962706026 
+pagerank,0,9,10,17,0.00023501100076828152 +newman,0,9,10,17,0.0007489189993066248 +counting,1,0,100,74,0.00030620799952885136 +average_win_rate,1,0,100,74,0.0003814190004050033 +bradley_terry,1,0,100,74,0.00030165300086082425 +elo,1,0,100,74,0.0002721780001593288 +eigen,1,0,100,74,0.0004299810007069027 +pagerank,1,0,100,74,0.00044431899914343376 +newman,1,0,100,74,0.007636578999154153 +counting,1,1,100,79,0.000292816001092433 +average_win_rate,1,1,100,79,0.00033032300052582286 +bradley_terry,1,1,100,79,0.00028794599893444683 +elo,1,1,100,79,0.0002661220005393261 +eigen,1,1,100,79,0.0004490609990170924 +pagerank,1,1,100,79,0.0004485159988689702 +newman,1,1,100,79,0.009778125000593718 +counting,1,2,100,76,0.0003203209998901002 +average_win_rate,1,2,100,76,0.00040047299989964813 +bradley_terry,1,2,100,76,0.0002908200003730599 +elo,1,2,100,76,0.0002687359992705751 +eigen,1,2,100,76,0.0005175170008442365 +pagerank,1,2,100,76,0.0004487120004341705 +newman,1,2,100,76,0.007054085999698145 +counting,1,3,100,76,0.0002908279984694673 +average_win_rate,1,3,100,76,0.00031621900052414276 +bradley_terry,1,3,100,76,0.00028453600134525914 +elo,1,3,100,76,0.0002627770008984953 +eigen,1,3,100,76,0.0006107949993747752 +pagerank,1,3,100,76,0.0004479929993976839 +newman,1,3,100,76,0.007557265000286861 +counting,1,4,100,81,0.0002934420008386951 +average_win_rate,1,4,100,81,0.00031563899938191753 +bradley_terry,1,4,100,81,0.00028770099925168324 +elo,1,4,100,81,0.00026809200062416494 +eigen,1,4,100,81,0.0005109329995320877 +pagerank,1,4,100,81,0.00046362400098587386 +newman,1,4,100,81,0.009141975000602542 +counting,1,5,100,84,0.0002787400007946417 +average_win_rate,1,5,100,84,0.00032620599995425437 +bradley_terry,1,5,100,84,0.0002895340003306046 +elo,1,5,100,84,0.0002653990013641305 +eigen,1,5,100,84,0.00047719099893583916 +pagerank,1,5,100,84,0.0005194530003791442 +newman,1,5,100,84,0.010597111000606674 +counting,1,6,100,76,0.0002742810011113761 
+average_win_rate,1,6,100,76,0.00030841399893688504 +bradley_terry,1,6,100,76,0.0002782299998216331 +elo,1,6,100,76,0.0002626820005389163 +eigen,1,6,100,76,0.00048205500024778303 +pagerank,1,6,100,76,0.0004458429993974278 +newman,1,6,100,76,0.00739087500005553 +counting,1,7,100,79,0.0002745689998846501 +average_win_rate,1,7,100,79,0.00032136499976331834 +bradley_terry,1,7,100,79,0.00028206600109115243 +elo,1,7,100,79,0.0002631920015119249 +eigen,1,7,100,79,0.0005142440004419768 +pagerank,1,7,100,79,0.00047578900012013037 +newman,1,7,100,79,0.008315376999235013 +counting,1,8,100,76,0.00028727399876515847 +average_win_rate,1,8,100,76,0.0003117989999736892 +bradley_terry,1,8,100,76,0.000283915000181878 +elo,1,8,100,76,0.00026839400015887804 +eigen,1,8,100,76,0.0005942299994785571 +pagerank,1,8,100,76,0.00045901199882791843 +newman,1,8,100,76,0.007320709999476094 +counting,1,9,100,72,0.00035176300116290804 +average_win_rate,1,9,100,72,0.000369478999346029 +bradley_terry,1,9,100,72,0.0003871850003633881 +elo,1,9,100,72,0.0002823230006470112 +eigen,1,9,100,72,0.000438703000327223 +pagerank,1,9,100,72,0.0004039089999423595 +newman,1,9,100,72,0.008697409999513184 +counting,2,0,1000,125,0.0008088259983196622 +average_win_rate,2,0,1000,125,0.0009630269996705465 +bradley_terry,2,0,1000,125,0.0008351320011570351 +elo,2,0,1000,125,0.0008005969993973849 +eigen,2,0,1000,125,0.0010372779997851467 +pagerank,2,0,1000,125,0.0010815969999384833 +newman,2,0,1000,125,0.022542565000549075 +counting,2,1,1000,121,0.0008665369987284066 +average_win_rate,2,1,1000,121,0.0008893220001482405 +bradley_terry,2,1,1000,121,0.0009047420007846085 +elo,2,1,1000,121,0.0008183749996533152 +eigen,2,1,1000,121,0.0009872540013020625 +pagerank,2,1,1000,121,0.0011108260005130433 +newman,2,1,1000,121,0.017428160001145443 +counting,2,2,1000,125,0.000997974999336293 +average_win_rate,2,2,1000,125,0.0009378610011481214 +bradley_terry,2,2,1000,125,0.000842120998640894 +elo,2,2,1000,125,0.0008023010013857856 
+eigen,2,2,1000,125,0.000997037999695749 +pagerank,2,2,1000,125,0.0009746420000737999 +newman,2,2,1000,125,0.017943572000149288 +counting,2,3,1000,125,0.0007974579984875163 +average_win_rate,2,3,1000,125,0.0009219669991580304 +bradley_terry,2,3,1000,125,0.0008377900012419559 +elo,2,3,1000,125,0.0008090929986792617 +eigen,2,3,1000,125,0.0010267899997415952 +pagerank,2,3,1000,125,0.0010596590000204742 +newman,2,3,1000,125,0.018462172998624737 +counting,2,4,1000,125,0.0007925039990368532 +average_win_rate,2,4,1000,125,0.0008871179998095613 +bradley_terry,2,4,1000,125,0.0008305339997605188 +elo,2,4,1000,125,0.0007950199997139862 +eigen,2,4,1000,125,0.0009913630001392448 +pagerank,2,4,1000,125,0.000988137999229366 +newman,2,4,1000,125,0.018565471000329126 +counting,2,5,1000,126,0.0007832179999240907 +average_win_rate,2,5,1000,126,0.0009029980010382133 +bradley_terry,2,5,1000,126,0.0008357120004802709 +elo,2,5,1000,126,0.0008232320014940342 +eigen,2,5,1000,126,0.0009980839986383216 +pagerank,2,5,1000,126,0.0010529600003792439 +newman,2,5,1000,126,0.017678407000857987 +counting,2,6,1000,125,0.0007898160001786891 +average_win_rate,2,6,1000,125,0.0008822500003589084 +bradley_terry,2,6,1000,125,0.0008350650005013449 +elo,2,6,1000,125,0.000795198000560049 +eigen,2,6,1000,125,0.0010111970004800241 +pagerank,2,6,1000,125,0.0009891369991237298 +newman,2,6,1000,125,0.016783688000941765 +counting,2,7,1000,123,0.0007830499998817686 +average_win_rate,2,7,1000,123,0.0008821749997878214 +bradley_terry,2,7,1000,123,0.00082825400022557 +elo,2,7,1000,123,0.00080576199979987 +eigen,2,7,1000,123,0.001001176999125164 +pagerank,2,7,1000,123,0.0009734800005389843 +newman,2,7,1000,123,0.01650292299927969 +counting,2,8,1000,123,0.0007909350006229943 +average_win_rate,2,8,1000,123,0.0009127470002567861 +bradley_terry,2,8,1000,123,0.0008258249999926193 +elo,2,8,1000,123,0.0008009169996512355 +eigen,2,8,1000,123,0.0009827560006669955 +pagerank,2,8,1000,123,0.000974902999587357 
+newman,2,8,1000,123,0.016843485000208602 +counting,2,9,1000,125,0.000776607999796397 +average_win_rate,2,9,1000,125,0.0008812329997454071 +bradley_terry,2,9,1000,125,0.0008380740000575315 +elo,2,9,1000,125,0.0007988679990376113 +eigen,2,9,1000,125,0.0009939809988281922 +pagerank,2,9,1000,125,0.0011957259994233027 +newman,2,9,1000,125,0.018085023999447003 +counting,3,0,10000,129,0.0057418010001129005 +average_win_rate,3,0,10000,129,0.006166844999825116 +bradley_terry,3,0,10000,129,0.006473531000665389 +elo,3,0,10000,129,0.006075818999306648 +eigen,3,0,10000,129,0.00588061700000253 +pagerank,3,0,10000,129,0.005920514000536059 +newman,3,0,10000,129,0.022724153999661212 +counting,3,1,10000,129,0.005780962999779149 +average_win_rate,3,1,10000,129,0.006237325000256533 +bradley_terry,3,1,10000,129,0.006568071999936365 +elo,3,1,10000,129,0.006252903000131482 +eigen,3,1,10000,129,0.006034148998878663 +pagerank,3,1,10000,129,0.0060988539989921264 +newman,3,1,10000,129,0.022733256000719848 +counting,3,2,10000,129,0.00573428600000625 +average_win_rate,3,2,10000,129,0.005961954000667902 +bradley_terry,3,2,10000,129,0.006385779999618535 +elo,3,2,10000,129,0.006188983999891207 +eigen,3,2,10000,129,0.005928496000706218 +pagerank,3,2,10000,129,0.0061022620011499384 +newman,3,2,10000,129,0.021870090000447817 +counting,3,3,10000,129,0.006096958999478375 +average_win_rate,3,3,10000,129,0.006036082999344217 +bradley_terry,3,3,10000,129,0.006411919999663951 +elo,3,3,10000,129,0.006224486000064644 +eigen,3,3,10000,129,0.006337910999718588 +pagerank,3,3,10000,129,0.0060976679997111205 +newman,3,3,10000,129,0.02208950899876072 +counting,3,4,10000,129,0.005711325999072869 +average_win_rate,3,4,10000,129,0.00617306400090456 +bradley_terry,3,4,10000,129,0.006570589999682852 +elo,3,4,10000,129,0.0061623940000572475 +eigen,3,4,10000,129,0.005996110001433408 +pagerank,3,4,10000,129,0.005996592000883538 +newman,3,4,10000,129,0.02256213399959961 +counting,3,5,10000,129,0.005701907000911888 
+average_win_rate,3,5,10000,129,0.00593840700094006 +bradley_terry,3,5,10000,129,0.006421697000405402 +elo,3,5,10000,129,0.006142845999420388 +eigen,3,5,10000,129,0.006037314999048249 +pagerank,3,5,10000,129,0.005979816000035498 +newman,3,5,10000,129,0.02260012900114816 +counting,3,6,10000,128,0.005801998999231728 +average_win_rate,3,6,10000,128,0.006050506999599747 +bradley_terry,3,6,10000,128,0.006604708998565911 +elo,3,6,10000,128,0.006155007000415935 +eigen,3,6,10000,128,0.0060436160001700046 +pagerank,3,6,10000,128,0.005992581000100472 +newman,3,6,10000,128,0.022780590999900596 +counting,3,7,10000,129,0.005735435001042788 +average_win_rate,3,7,10000,129,0.005850155999723938 +bradley_terry,3,7,10000,129,0.006509283999548643 +elo,3,7,10000,129,0.006142328998976154 +eigen,3,7,10000,129,0.006004048000249895 +pagerank,3,7,10000,129,0.006132489999799873 +newman,3,7,10000,129,0.026188362999164383 +counting,3,8,10000,129,0.005722198999137618 +average_win_rate,3,8,10000,129,0.006097378000049503 +bradley_terry,3,8,10000,129,0.006650044999332749 +elo,3,8,10000,129,0.006111233000410721 +eigen,3,8,10000,129,0.005983775001368485 +pagerank,3,8,10000,129,0.005842284001118969 +newman,3,8,10000,129,0.023720418001175858 +counting,3,9,10000,129,0.005694748999303556 +average_win_rate,3,9,10000,129,0.005891359000088414 +bradley_terry,3,9,10000,129,0.006356322999636177 +elo,3,9,10000,129,0.006110670999987633 +eigen,3,9,10000,129,0.005836091999299242 +pagerank,3,9,10000,129,0.00582017399938195 +newman,3,9,10000,129,0.02291666599921882 +counting,4,0,100000,129,0.055306902999291196 +average_win_rate,4,0,100000,129,0.0540534520005167 +bradley_terry,4,0,100000,129,0.05615618699994229 +elo,4,0,100000,129,0.05806371200014837 +eigen,4,0,100000,129,0.056263327000124264 +pagerank,4,0,100000,129,0.05612392100010766 +newman,4,0,100000,129,0.07101105199944868 +counting,4,1,100000,129,0.05431412999860186 +average_win_rate,4,1,100000,129,0.05540736600050877 
+bradley_terry,4,1,100000,129,0.05487602799985325 +elo,4,1,100000,129,0.06213381400084472 +eigen,4,1,100000,129,0.054964537999694585 +pagerank,4,1,100000,129,0.05520601699936378 +newman,4,1,100000,129,0.0714055959997495 +counting,4,2,100000,129,0.05493461100013519 +average_win_rate,4,2,100000,129,0.05479015399942 +bradley_terry,4,2,100000,129,0.055622695001147804 +elo,4,2,100000,129,0.05759048599975358 +eigen,4,2,100000,129,0.05637408300026436 +pagerank,4,2,100000,129,0.05424721000053978 +newman,4,2,100000,129,0.07572765400072967 +counting,4,3,100000,129,0.05420924799909699 +average_win_rate,4,3,100000,129,0.05385507699975278 +bradley_terry,4,3,100000,129,0.05560516800142068 +elo,4,3,100000,129,0.05758235499888542 +eigen,4,3,100000,129,0.054836004001117544 +pagerank,4,3,100000,129,0.05504092500086699 +newman,4,3,100000,129,0.0729222949994437 +counting,4,4,100000,129,0.054385650999392965 +average_win_rate,4,4,100000,129,0.05445102600060636 +bradley_terry,4,4,100000,129,0.055281251999986125 +elo,4,4,100000,129,0.05777581100119278 +eigen,4,4,100000,129,0.05389341199952469 +pagerank,4,4,100000,129,0.05457382799977495 +newman,4,4,100000,129,0.07069670300006692 +counting,4,5,100000,129,0.05379963300038071 +average_win_rate,4,5,100000,129,0.05377480199967977 +bradley_terry,4,5,100000,129,0.054551111001273966 +elo,4,5,100000,129,0.057554491999326274 +eigen,4,5,100000,129,0.0545633750007255 +pagerank,4,5,100000,129,0.05454033000023628 +newman,4,5,100000,129,0.07167843299976084 +counting,4,6,100000,129,0.053969008000422036 +average_win_rate,4,6,100000,129,0.05496674799906032 +bradley_terry,4,6,100000,129,0.055726652999510407 +elo,4,6,100000,129,0.05792664699947636 +eigen,4,6,100000,129,0.054672706999554066 +pagerank,4,6,100000,129,0.05465731699950993 +newman,4,6,100000,129,0.07766897100009373 +counting,4,7,100000,129,0.05373375500130351 +average_win_rate,4,7,100000,129,0.05460392599889019 +bradley_terry,4,7,100000,129,0.05480001100113441 
+elo,4,7,100000,129,0.05772771800002374 +eigen,4,7,100000,129,0.0541008439995494 +pagerank,4,7,100000,129,0.05445865499859792 +newman,4,7,100000,129,0.07882988500023203 +counting,4,8,100000,129,0.05401118900044821 +average_win_rate,4,8,100000,129,0.05366829299964593 +bradley_terry,4,8,100000,129,0.05514396000035049 +elo,4,8,100000,129,0.0576308230010909 +eigen,4,8,100000,129,0.05406591599967214 +pagerank,4,8,100000,129,0.053852559000006295 +newman,4,8,100000,129,0.07007353800145211 +counting,4,9,100000,129,0.05382303599981242 +average_win_rate,4,9,100000,129,0.05412292300025001 +bradley_terry,4,9,100000,129,0.05458087499937392 +elo,4,9,100000,129,0.05829813000127615 +eigen,4,9,100000,129,0.05410223699982453 +pagerank,4,9,100000,129,0.05389093299891101 +newman,4,9,100000,129,0.0739105330012535 +counting,5,0,1000000,129,0.5537628380006936 +average_win_rate,5,0,1000000,129,0.557447110000794 +bradley_terry,5,0,1000000,129,0.566645160999542 +elo,5,0,1000000,129,0.5983954740004265 +eigen,5,0,1000000,129,0.5585428529993806 +pagerank,5,0,1000000,129,0.5469066990008287 +newman,5,0,1000000,129,0.576394486000936 +counting,5,1,1000000,129,0.5776889749995462 +average_win_rate,5,1,1000000,129,0.548370893999163 +bradley_terry,5,1,1000000,129,0.5498180590002448 +elo,5,1,1000000,129,0.5880861599998752 +eigen,5,1,1000000,129,0.5541361949999555 +pagerank,5,1,1000000,129,0.5486874909984181 +newman,5,1,1000000,129,0.5748529020002024 +counting,5,2,1000000,129,0.5465993010002421 +average_win_rate,5,2,1000000,129,0.5473639329993603 +bradley_terry,5,2,1000000,129,0.5530196299987438 +elo,5,2,1000000,129,0.5931451379983628 +eigen,5,2,1000000,129,0.5468321030002699 +pagerank,5,2,1000000,129,0.5498886269997456 +newman,5,2,1000000,129,0.5772298829997453 +counting,5,3,1000000,129,0.5488438030006364 +average_win_rate,5,3,1000000,129,0.5449507030007226 +bradley_terry,5,3,1000000,129,0.550225651000801 +elo,5,3,1000000,129,0.5890020560000266 +eigen,5,3,1000000,129,0.5481600370003434 
+pagerank,5,3,1000000,129,0.5490529039998364 +newman,5,3,1000000,129,0.5696388150008715 +counting,5,4,1000000,129,0.5459083829991869 +average_win_rate,5,4,1000000,129,0.5463461549989006 +bradley_terry,5,4,1000000,129,0.5486129879991495 +elo,5,4,1000000,129,0.5833805160000338 +eigen,5,4,1000000,129,0.5468219840004167 +pagerank,5,4,1000000,129,0.5512352870009636 +newman,5,4,1000000,129,0.6313520179992338 +counting,5,5,1000000,129,0.556015183999989 +average_win_rate,5,5,1000000,129,0.5475488110005244 +bradley_terry,5,5,1000000,129,0.5484229829999094 +elo,5,5,1000000,129,0.5886215000009543 +eigen,5,5,1000000,129,0.5477758569995785 +pagerank,5,5,1000000,129,0.5512803410001652 +newman,5,5,1000000,129,0.5693677109993587 +counting,5,6,1000000,129,0.547499821999736 +average_win_rate,5,6,1000000,129,0.5443652339999971 +bradley_terry,5,6,1000000,129,0.5482724960002088 +elo,5,6,1000000,129,0.5863554699990345 +eigen,5,6,1000000,129,0.5470804319993476 +pagerank,5,6,1000000,129,0.5527709140005754 +newman,5,6,1000000,129,0.5702359510014503 +counting,5,7,1000000,129,0.5458939679992909 +average_win_rate,5,7,1000000,129,0.5459242629985965 +bradley_terry,5,7,1000000,129,0.5540151549994334 +elo,5,7,1000000,129,0.5894875640005921 +eigen,5,7,1000000,129,0.547433497999009 +pagerank,5,7,1000000,129,0.5461621030008246 +newman,5,7,1000000,129,0.5741183169993747 +counting,5,8,1000000,129,0.5458372670000244 +average_win_rate,5,8,1000000,129,0.5499557429993729 +bradley_terry,5,8,1000000,129,0.550853709999501 +elo,5,8,1000000,129,0.5901601389996358 +eigen,5,8,1000000,129,0.5537487739984499 +pagerank,5,8,1000000,129,0.5535428399998636 +newman,5,8,1000000,129,0.5766435830009868 +counting,5,9,1000000,129,0.5553120160002436 +average_win_rate,5,9,1000000,129,0.5486225860004197 +bradley_terry,5,9,1000000,129,0.5499870700004976 +elo,5,9,1000000,129,0.587571478999962 +eigen,5,9,1000000,129,0.554911824001465 +pagerank,5,9,1000000,129,0.551681938000911 +newman,5,9,1000000,129,0.6301605019998533 
+counting,6,0,10000000,129,5.584956127000623 +average_win_rate,6,0,10000000,129,5.546025948000533 +bradley_terry,6,0,10000000,129,5.444556124999508 +elo,6,0,10000000,129,5.811693602001469 +eigen,6,0,10000000,129,5.484382615000868 +pagerank,6,0,10000000,129,5.431257750000441 +newman,6,0,10000000,129,5.447865680000177 +counting,6,1,10000000,129,5.5414480269992055 +average_win_rate,6,1,10000000,129,5.534390394001093 +bradley_terry,6,1,10000000,129,5.513975445999677 +elo,6,1,10000000,129,5.908209359000466 +eigen,6,1,10000000,129,5.515909311001451 +pagerank,6,1,10000000,129,5.5220202620002965 +newman,6,1,10000000,129,5.576096300001154 +counting,6,2,10000000,129,5.464235542000097 +average_win_rate,6,2,10000000,129,5.463856446000136 +bradley_terry,6,2,10000000,129,5.478453263998745 +elo,6,2,10000000,129,5.844219248998343 +eigen,6,2,10000000,129,5.442783524000333 +pagerank,6,2,10000000,129,5.426823231000526 +newman,6,2,10000000,129,5.502706528999624 +counting,6,3,10000000,129,5.458096222000677 +average_win_rate,6,3,10000000,129,5.444172005001747 +bradley_terry,6,3,10000000,129,5.456373425000493 +elo,6,3,10000000,129,5.869286351000483 +eigen,6,3,10000000,129,5.475269013999423 +pagerank,6,3,10000000,129,5.436610488999577 +newman,6,3,10000000,129,5.534538046000307 +counting,6,4,10000000,129,5.479107171999203 +average_win_rate,6,4,10000000,129,5.489579851000599 +bradley_terry,6,4,10000000,129,5.636063873000239 +elo,6,4,10000000,129,5.888289676999193 +eigen,6,4,10000000,129,5.604227809000804 +pagerank,6,4,10000000,129,5.610040568999466 +newman,6,4,10000000,129,5.603313765001076 +counting,6,5,10000000,129,5.582531500998812 +average_win_rate,6,5,10000000,129,5.5502123330006725 +bradley_terry,6,5,10000000,129,5.623680609998701 +elo,6,5,10000000,129,6.005477833999976 +eigen,6,5,10000000,129,5.603588855999988 +pagerank,6,5,10000000,129,5.761078469000495 +newman,6,5,10000000,129,5.650248751999243 +counting,6,6,10000000,129,5.571540818000358 
+average_win_rate,6,6,10000000,129,5.495064687000195
+bradley_terry,6,6,10000000,129,5.517097117999583
+elo,6,6,10000000,129,6.009124817999691
+eigen,6,6,10000000,129,5.523844548999477
+pagerank,6,6,10000000,129,5.611148018999302
+newman,6,6,10000000,129,5.515429049999511
+counting,6,7,10000000,129,5.448642265000672
+average_win_rate,6,7,10000000,129,5.448376202000873
+bradley_terry,6,7,10000000,129,5.544537397998283
+elo,6,7,10000000,129,6.017319021000731
+eigen,6,7,10000000,129,5.5518685650004045
+pagerank,6,7,10000000,129,5.52255630000036
+newman,6,7,10000000,129,5.605366936000792
+counting,6,8,10000000,129,5.7295167009997385
+average_win_rate,6,8,10000000,129,5.548453061001055
+bradley_terry,6,8,10000000,129,5.531914189001327
+elo,6,8,10000000,129,5.9395085369997105
+eigen,6,8,10000000,129,5.5266967409988865
+pagerank,6,8,10000000,129,5.570658406000803
+newman,6,8,10000000,129,5.672336362000351
+counting,6,9,10000000,129,5.492401758998312
+average_win_rate,6,9,10000000,129,5.495557521999217
+bradley_terry,6,9,10000000,129,5.47870610100108
+elo,6,9,10000000,129,5.9217117030002555
+eigen,6,9,10000000,129,5.511196595000001
+pagerank,6,9,10000000,129,5.505412172000433
+newman,6,9,10000000,129,5.620188545000929
diff --git a/coling2025/scale_compute.py b/coling2025/scale_compute.py
new file mode 100755
index 0000000..8bfac75
--- /dev/null
+++ b/coling2025/scale_compute.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+from functools import partial
+from pathlib import Path
+from timeit import timeit
+from typing import TYPE_CHECKING, Any, cast
+
+import evalica
+import pandas as pd
+from scale_data import REPETITIONS, SCALE
+from tqdm.auto import trange
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+# Evalica functions to benchmark; each one is timed on every sample.
+ALGORITHMS = [
+    evalica.counting,
+    evalica.average_win_rate,
+    evalica.bradley_terry,
+    evalica.elo,
+    evalica.eigen,
+    evalica.pagerank,
+    evalica.newman,
+]
+
+
+def main() -> None:
+    """Time each algorithm once per sample and write the measurements to scale.csv.
+
+    Reads the scale/scale_{scale}_{i}.parquet files produced by scale_data.py.
+    """
+    results = []
+
+    for scale in trange(SCALE, desc="scale"):
+        for i in range(REPETITIONS):
+            with (Path("scale") / f"scale_{scale}_{i}.parquet").open("rb") as f:
+                df_sample = pd.read_parquet(f)
+
+            df_sample["winner"] = df_sample["winner"].map({
+                "model_a": evalica.Winner.X,
+                "model_b": evalica.Winner.Y,
+                "tie": evalica.Winner.Draw,
+                "tie (bothbad)": evalica.Winner.Draw,
+            })
+
+            # Build the index once per sample so that all algorithms share it.
+            _, _, index = evalica.indexing(df_sample["model_a"], df_sample["model_b"])
+
+            for algorithm in ALGORITHMS:
+                stmt = partial(
+                    cast("Callable[..., Any]", algorithm),
+                    xs=df_sample["model_a"],
+                    ys=df_sample["model_b"],
+                    winners=df_sample["winner"],
+                    index=index,
+                    solver="pyo3",
+                )
+
+                # One call per sample; repeated measurements come from the REPETITIONS samples.
+                time = timeit(stmt, number=1)
+
+                results.append((algorithm.__name__, scale, i, len(df_sample), len(index), time))
+
+    df_results = pd.DataFrame(results, columns=["algorithm", "scale", "i", "rows", "models", "time"])
+    df_results.to_csv("scale.csv", index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/coling2025/scale_data.py b/coling2025/scale_data.py
new file mode 100755
index 0000000..e35ca2c
--- /dev/null
+++ b/coling2025/scale_data.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+from pathlib import Path
+
+import evalica
+import pandas as pd
+from tqdm.auto import trange
+
+# Number of sample sizes; scale s produces samples of 10 ** (s + 1) rows.
+SCALE = 7
+
+# Number of differently seeded samples drawn for every sample size.
+REPETITIONS = 10
+
+
+def main() -> None:
+    """Resample the battle dump into scale/scale_{scale}_{i}.parquet files.
+
+    Keeps the rows whose "anony" value is true, whose "dedup_tag" has a truthy
+    "sampled" entry, and whose "winner" label maps to an evalica.Winner. Every
+    sample is drawn with replacement using the fixed seed scale * 10 + i.
+    """
+    df_arena = pd.read_json("clean_battle_20240814_public.json")
+    df_arena = df_arena[df_arena["anony"]]
+    df_arena = df_arena[df_arena["dedup_tag"].apply(lambda x: x.get("sampled", False))]
+    df_arena["evalica"] = df_arena["winner"].map({
+        "model_a": evalica.Winner.X,
+        "model_b": evalica.Winner.Y,
+        "tie": evalica.Winner.Draw,
+        "tie (bothbad)": evalica.Winner.Draw,
+    })
+    df_arena = df_arena[~df_arena["evalica"].isna()]
+
+    # Opening a file for writing does not create missing directories, so make
+    # sure the output directory exists before the first sample is written.
+    output_dir = Path("scale")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for scale in trange(SCALE):
+        for i in range(REPETITIONS):
+            with (output_dir / f"scale_{scale}_{i}.parquet").open("wb") as f:
+                df_sample = df_arena.sample(n=10 ** (scale + 1), replace=True, random_state=scale * 10 + i)
+                df_sample[["model_a", "model_b", "winner"]].to_parquet(f, index=False)
+
+
+if __name__ == "__main__":
+    main()