Commit cb03bd4
fix: Refactor split create_tables into static Benchmark methods (#3126)
* feat - Split create_tables into static Benchmark methods
* feat - format
* Update mteb/leaderboard/table.py
  Co-authored-by: Kenneth Enevoldsen <[email protected]>
* feat - remove search query; take benchmark result as input; address the circular import
* feat - format
* Update mteb/benchmarks/benchmark.py
  Co-authored-by: Kenneth Enevoldsen <[email protected]>
* Update mteb/benchmarks/benchmark.py
  Co-authored-by: Kenneth Enevoldsen <[email protected]>
* feat - use to_dataframe; clean table.py; move create_table
* feat - fix circular import
* feat - clean-up
* feat - format

---------

Co-authored-by: Kenneth Enevoldsen <[email protected]>
1 parent 15f9909 commit cb03bd4

5 files changed (+435, -255 lines)

mteb/benchmarks/_create_table.py

Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@
+from __future__ import annotations
+
+import math
+import re
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+
+from mteb.load_results.benchmark_results import BenchmarkResults
+from mteb.overview import get_task, get_tasks
+
+
+def _borda_count(scores: pd.Series) -> pd.Series:
+    n = len(scores)
+    ranks = scores.rank(method="average", ascending=False)
+    counts = n - ranks
+    return counts
+
+
+def _get_borda_rank(score_table: pd.DataFrame) -> pd.Series:
+    borda_counts = score_table.apply(_borda_count, axis="index")
+    mean_borda = borda_counts.sum(axis=1)
+    return mean_borda.rank(method="min", ascending=False).astype(int)
+
+
+def _split_on_capital(s: str) -> str:
+    """Splits on capital letters and joins with spaces"""
+    return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s))
+
+
+def _format_n_parameters(n_parameters) -> str:
+    if (n_parameters is None) or (not int(n_parameters)):
+        return "Unknown"
+    n_thousand = int(n_parameters // 1e3)
+    if n_thousand < 1:
+        return str(int(n_parameters))
+    n_zeros = math.log10(n_thousand)
+    if n_zeros >= 6:
+        return str(n_thousand // (10**6)) + "B"
+    if n_zeros >= 3:
+        return str(n_thousand // (10**3)) + "M"
+    return str(n_thousand) + "K"
+
+
+def _format_max_tokens(max_tokens: float | None) -> str:
+    if max_tokens is None:
+        return "Unknown"
+    if max_tokens == np.inf:
+        return "Infinite"
+    return str(int(max_tokens))
+
+
+def _failsafe_get_model_meta(model_name):
+    try:
+        from mteb.models.overview import get_model_meta
+
+        return get_model_meta(model_name)
+    except Exception:
+        return None
+
+
+def _get_means_per_types(per_task: pd.DataFrame):
+    task_names_per_type = defaultdict(list)
+    for task_name in per_task.columns:
+        task_type = get_task(task_name).metadata.type
+        task_names_per_type[task_type].append(task_name)
+    records = []
+    for task_type, tasks in task_names_per_type.items():
+        for model_name, scores in per_task.iterrows():
+            records.append(
+                dict(
+                    model_name=model_name,
+                    task_type=task_type,
+                    score=scores[tasks].mean(skipna=False),
+                )
+            )
+    return pd.DataFrame.from_records(records)
+
+
+def _create_summary_table_from_benchmark_results(
+    benchmark_results: BenchmarkResults,
+) -> pd.DataFrame:
+    """Create summary table from BenchmarkResults.
+
+    Returns a DataFrame with one row per model containing summary statistics
+    and task type averages.
+
+    Args:
+        benchmark_results: BenchmarkResults object containing model results
+
+    Returns:
+        DataFrame with model summaries, ready for styling in the leaderboard
+    """
+    data = benchmark_results.to_dataframe(format="long")
+
+    if data.empty:
+        no_results_frame = pd.DataFrame(
+            {"No results": ["You can try relaxing your criteria"]}
+        )
+        return no_results_frame
+
+    # Convert to DataFrame and pivot
+    per_task = data.pivot(index="model_name", columns="task_name", values="score")
+
+    # Remove models with no scores
+    to_remove = per_task.isna().all(axis="columns")
+    if to_remove.all():
+        no_results_frame = pd.DataFrame(
+            {"No results": ["You can try relaxing your criteria"]}
+        )
+        return no_results_frame
+
+    models_to_remove = list(per_task[to_remove].index)
+    per_task = per_task.drop(models_to_remove, axis=0)
+
+    # Calculate means by task type
+    mean_per_type = _get_means_per_types(per_task)
+    mean_per_type = mean_per_type.pivot(
+        index="model_name", columns="task_type", values="score"
+    )
+    mean_per_type.columns = [
+        _split_on_capital(column) for column in mean_per_type.columns
+    ]
+
+    # Calculate overall means
+    typed_mean = mean_per_type.mean(skipna=False, axis=1)
+    overall_mean = per_task.mean(skipna=False, axis=1)
+
+    # Build joint table
+    joint_table = mean_per_type.copy()
+    joint_table = joint_table.drop(models_to_remove, axis=0)
+    joint_table.insert(0, "mean", overall_mean)
+    joint_table.insert(1, "mean_by_task_type", typed_mean)
+    joint_table["borda_rank"] = _get_borda_rank(per_task)
+    joint_table = joint_table.sort_values("borda_rank", ascending=True)
+    joint_table = joint_table.reset_index()
+
+    # Add model metadata
+    model_metas = joint_table["model_name"].map(_failsafe_get_model_meta)
+    joint_table = joint_table[model_metas.notna()]
+    joint_table["model_link"] = model_metas.map(lambda m: m.reference)
+
+    # Insert model metadata columns
+    joint_table.insert(
+        1,
+        "Max Tokens",
+        model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
+    )
+    joint_table.insert(
+        1,
+        "Embedding Dimensions",
+        model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
+    )
+    joint_table.insert(
+        1,
+        "Number of Parameters",
+        model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
+    )
+    joint_table.insert(
+        1,
+        "Memory Usage (MB)",
+        model_metas.map(
+            lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
+        ),
+    )
+
+    # Add zero-shot percentage
+    tasks = get_tasks(tasks=list(data["task_name"].unique()))
+    joint_table.insert(
+        1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks))
+    )
+    joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1)
+
+    # Clean up model names (remove HF organization)
+    joint_table["model_name"] = joint_table["model_name"].map(
+        lambda name: name.split("/")[-1]
+    )
+
+    # Add markdown links to model names
+    name_w_link = (
+        "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")"
+    )
+    joint_table["model_name"] = joint_table["model_name"].mask(
+        joint_table["model_link"].notna(), name_w_link
+    )
+    joint_table = joint_table.drop(columns=["model_link"])
+
+    # Rename columns
+    joint_table = joint_table.rename(
+        columns={
+            "model_name": "Model",
+            "mean_by_task_type": "Mean (TaskType)",
+            "mean": "Mean (Task)",
+        }
+    )
+
+    # Move borda rank to front
+    joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))
+
+    return joint_table
+
+
+def _create_per_task_table_from_benchmark_results(
+    benchmark_results: BenchmarkResults,
+) -> pd.DataFrame:
+    """Create per-task table from BenchmarkResults.
+
+    Returns a DataFrame with one row per model and one column per task.
+
+    Args:
+        benchmark_results: BenchmarkResults object containing model results
+
+    Returns:
+        DataFrame with per-task scores, ready for styling in the leaderboard
+    """
+    # Get scores in long format
+    data = benchmark_results.to_dataframe(format="long")
+
+    if data.empty:
+        no_results_frame = pd.DataFrame(
+            {"No results": ["You can try relaxing your criteria"]}
+        )
+        return no_results_frame
+
+    # Convert to DataFrame and pivot
+    per_task = data.pivot(index="model_name", columns="task_name", values="score")
+
+    # Remove models with no scores
+    to_remove = per_task.isna().all(axis="columns")
+    if to_remove.all():
+        no_results_frame = pd.DataFrame(
+            {"No results": ["You can try relaxing your criteria"]}
+        )
+        return no_results_frame
+
+    models_to_remove = list(per_task[to_remove].index)
+    per_task = per_task.drop(models_to_remove, axis=0)
+
+    # Add borda rank and sort
+    per_task["borda_rank"] = _get_borda_rank(per_task)
+    per_task = per_task.sort_values("borda_rank", ascending=True)
+    per_task = per_task.drop(columns=["borda_rank"])
+    per_task = per_task.reset_index()
+
+    # Clean up model names (remove HF organization)
+    per_task["model_name"] = per_task["model_name"].map(
+        lambda name: name.split("/")[-1]
+    )
+    per_task = per_task.rename(
+        columns={
+            "model_name": "Model",
+        }
+    )
+
+    return per_task
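The Borda ranking in this file is easiest to see on a toy score table. The following is a minimal sketch, not part of the commit; it imports the private helpers from mteb.benchmarks._create_table for illustration only, and the model and task names are made up.

# Illustrative only: _borda_count and _get_borda_rank are private helpers.
import pandas as pd

from mteb.benchmarks._create_table import _borda_count, _get_borda_rank

score_table = pd.DataFrame(
    {
        "TaskA": [0.80, 0.60, 0.70],
        "TaskB": [0.50, 0.90, None],  # a missing score earns no points for that task
    },
    index=["model-x", "model-y", "model-z"],
)

# Per task: with n models, the best gets n - 1 points, the next n - 2, and so on;
# tied scores share the average rank, so they also share points.
per_task_points = score_table.apply(_borda_count, axis="index")
print(per_task_points)

# Overall: points are summed per model and ranked descending (1 = best).
print(_get_borda_rank(score_table))
# Expected here: model-x -> 1, model-y -> 2, model-z -> 3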

mteb/benchmarks/benchmark.py

Lines changed: 17 additions & 0 deletions
@@ -4,8 +4,13 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Annotated
 
+import pandas as pd
 from pydantic import AnyUrl, BeforeValidator, TypeAdapter
 
+from mteb.benchmarks._create_table import (
+    _create_per_task_table_from_benchmark_results,
+    _create_summary_table_from_benchmark_results,
+)
 from mteb.load_results.load_results import load_results
 
 if TYPE_CHECKING:
@@ -72,3 +77,15 @@ def load_results(
         results = base_results.select_tasks(self.tasks)
         self.results_cache[base_results] = results
         return results
+
+    def _create_summary_table(
+        self, benchmark_results: BenchmarkResults
+    ) -> pd.DataFrame:
+        """Create summary table. Called by the leaderboard app."""
+        return _create_summary_table_from_benchmark_results(benchmark_results)
+
+    def _create_per_task_table(
+        self, benchmark_results: BenchmarkResults
+    ) -> pd.DataFrame:
+        """Create per-task table. Called by the leaderboard app."""
+        return _create_per_task_table_from_benchmark_results(benchmark_results)
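With these methods on Benchmark, a minimal usage sketch outside the leaderboard app could look like the following. The benchmark name is illustrative, and calling load_results() with no arguments assumes its defaults are sufficient; the leaderboard app remains the intended caller.

import mteb

# Illustrative benchmark name; any registered benchmark should behave the same way.
benchmark = mteb.get_benchmark("MTEB(eng, v2)")
benchmark_results = benchmark.load_results()  # assumes default arguments suffice

summary_df = benchmark._create_summary_table(benchmark_results)    # one row per model
per_task_df = benchmark._create_per_task_table(benchmark_results)  # one column per task
print(summary_df.head())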

mteb/leaderboard/app.py

Lines changed: 54 additions & 16 deletions
@@ -25,7 +25,10 @@
     make_selector,
 )
 from mteb.leaderboard.figures import performance_size_plot, radar_chart
-from mteb.leaderboard.table import create_tables
+from mteb.leaderboard.table import (
+    apply_per_task_styling_from_benchmark,
+    apply_summary_styling_from_benchmark,
+)
 from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ
 
 logger = logging.getLogger(__name__)
@@ -236,10 +239,21 @@ def get_leaderboard_app() -> gr.Blocks:
         max_model_size=MAX_MODEL_SIZE,
         zero_shot_setting="allow_all",
     )
+    default_filtered_scores = [
+        entry for entry in default_scores if entry["model_name"] in filtered_models
+    ]
+
+    # Filter BenchmarkResults based on default filtered models (as required by Kenneth)
+    filtered_model_names = [entry["model_name"] for entry in default_filtered_scores]
+    filtered_benchmark_results = default_results.select_models(filtered_model_names)
 
-    summary_table, per_task_table = create_tables(
-        [entry for entry in default_scores if entry["model_name"] in filtered_models]
+    summary_table = apply_summary_styling_from_benchmark(
+        default_benchmark, filtered_benchmark_results
+    )
+    per_task_table = apply_per_task_styling_from_benchmark(
+        default_benchmark, filtered_benchmark_results
     )
+
     lang_select = gr.Dropdown(
         LANGUAGE,
         value=sorted(default_results.languages),
@@ -774,19 +788,43 @@ def update_tables(
     tasks = set(tasks)
     benchmark = mteb.get_benchmark(benchmark_name)
     benchmark_tasks = {task.metadata.name for task in benchmark.tasks}
-    if (benchmark_tasks != tasks) or (models_to_keep is not None):
-        filtered_scores = []
-        for entry in scores:
-            if entry["task_name"] not in tasks:
-                continue
-            if (models_to_keep is not None) and (
-                entry["model_name"] not in models_to_keep
-            ):
-                continue
-            filtered_scores.append(entry)
-    else:
-        filtered_scores = scores
-    summary, per_task = create_tables(filtered_scores)
+
+    # Extract filtered model and task names from scores (respects UI filters)
+    filtered_model_names = set()
+    filtered_task_names = set()
+
+    for entry in scores:
+        if entry["task_name"] not in tasks:
+            continue
+        if (models_to_keep is not None) and (
+            entry["model_name"] not in models_to_keep
+        ):
+            continue
+        filtered_model_names.add(entry["model_name"])
+        filtered_task_names.add(entry["task_name"])
+
+    # Create filtered BenchmarkResults as required by Kenneth
+    benchmark_results = all_benchmark_results[benchmark_name]
+    filtered_benchmark_results = benchmark_results
+
+    # Apply task filtering if needed
+    if filtered_task_names != benchmark_tasks:
+        filtered_benchmark_results = filtered_benchmark_results.filter_tasks(
+            task_names=list(filtered_task_names)
+        )
+
+    # Apply model filtering if needed
+    if filtered_model_names:
+        filtered_benchmark_results = filtered_benchmark_results.select_models(
+            list(filtered_model_names)
+        )
+
+    summary = apply_summary_styling_from_benchmark(
+        benchmark, filtered_benchmark_results
+    )
+    per_task = apply_per_task_styling_from_benchmark(
+        benchmark, filtered_benchmark_results
+    )
     elapsed = time.time() - start_time
     logger.debug(f"update_tables callback: {elapsed}s")
     return summary, per_task
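For completeness, the filter-then-style flow in this callback can be sketched outside Gradio roughly as follows. This is a sketch under the assumption that the benchmark, task, and model names below exist in the loaded results; none of them are taken from this commit.

import mteb
from mteb.leaderboard.table import (
    apply_per_task_styling_from_benchmark,
    apply_summary_styling_from_benchmark,
)

# Illustrative names; substitute any benchmark/tasks/models present in the results.
benchmark = mteb.get_benchmark("MTEB(eng, v2)")
benchmark_results = benchmark.load_results()

# Mirror the UI filters: narrow the results to a task subset and a model subset.
filtered = benchmark_results.filter_tasks(task_names=["Banking77Classification"])
filtered = filtered.select_models(["intfloat/multilingual-e5-small"])

# The styled tables shown in the leaderboard are then produced from the Benchmark.
summary = apply_summary_styling_from_benchmark(benchmark, filtered)
per_task = apply_per_task_styling_from_benchmark(benchmark, filtered)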
