
Benchmark

A benchmark within mteb is essentially a list of tasks along with some metadata describing the benchmark.

Figure: An overview of the benchmark within mteb.

This metadata includes a short description of the benchmark's intent, a reference, and a citation. If you use a benchmark from mteb, we recommend citing it along with mteb.
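
For orientation, here is a minimal sketch of fetching a benchmark and inspecting this metadata. The benchmark name is an illustrative assumption; substitute any name returned by mteb.get_benchmarks().

import mteb

# "MTEB(eng, v2)" is an assumed example name; use any registered benchmark name.
benchmark = mteb.get_benchmark("MTEB(eng, v2)")

print(benchmark.description)  # short description of the benchmark's intent
print(benchmark.reference)    # link to a paper, leaderboard or GitHub repository
print(benchmark.citation)     # bibtex entry to cite alongside mteb
print(len(benchmark.tasks))   # number of tasks bundled in the benchmark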

Utilities

mteb.get_benchmarks(names=None, display_on_leaderboard=None)

Get a list of benchmarks by name.

Parameters:

names (list[str] | None, default None): A list of benchmark names to retrieve. If None, all benchmarks are returned.
display_on_leaderboard (bool | None, default None): If specified, filters benchmarks by whether they are displayed on the leaderboard.
Source code in mteb/benchmarks/get_benchmark.py
def get_benchmarks(
    names: list[str] | None = None, display_on_leaderboard: bool | None = None
) -> list[Benchmark]:
    """Get a list of benchmarks by name.

    Args:
        names: A list of benchmark names to retrieve. If None, all benchmarks are returned.
        display_on_leaderboard: If specified, filters benchmarks by whether they are displayed on the leaderboard.
    """
    benchmark_registry = _build_registry()

    if names is None:
        names = list(benchmark_registry.keys())
    benchmarks = [get_benchmark(name) for name in names]
    if display_on_leaderboard is not None:
        benchmarks = [
            b for b in benchmarks if b.display_on_leaderboard is display_on_leaderboard
        ]
    return benchmarks
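
A short usage sketch, assuming the package is importable as mteb:

import mteb

all_benchmarks = mteb.get_benchmarks()  # every registered benchmark
leaderboard_benchmarks = mteb.get_benchmarks(display_on_leaderboard=True)  # only those shown on the leaderboard
print(len(all_benchmarks), len(leaderboard_benchmarks))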

mteb.get_benchmark(benchmark_name)

Get a benchmark by name.

Parameters:

benchmark_name (str, required): The name of the benchmark to retrieve.
Source code in mteb/benchmarks/get_benchmark.py
def get_benchmark(
    benchmark_name: str,
) -> Benchmark:
    """Get a benchmark by name.

    Args:
        benchmark_name: The name of the benchmark to retrieve.
    """
    previous_benchmark_names = _get_previous_benchmark_names()
    benchmark_registry = _build_registry()
    if benchmark_name in previous_benchmark_names:
        warnings.warn(
            f"Using the previous benchmark name '{benchmark_name}' is deprecated. Please use '{previous_benchmark_names[benchmark_name]}' instead.",
            DeprecationWarning,
        )
        benchmark_name = previous_benchmark_names[benchmark_name]
    if benchmark_name not in benchmark_registry:
        close_matches = difflib.get_close_matches(
            benchmark_name, benchmark_registry.keys()
        )
        if close_matches:
            suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean: {close_matches[0]}?"
        else:
            suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found."
        raise KeyError(suggestion)
    return benchmark_registry[benchmark_name]
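
A minimal usage sketch; the benchmark name below is an illustrative assumption. Unknown names raise a KeyError that suggests the closest registered name, and deprecated names resolve to their replacement with a DeprecationWarning.

import mteb

benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # assumed example name
print(benchmark.name)

# A misspelled name raises KeyError with a "Did you mean: ...?" suggestion,
# produced via difflib.get_close_matches as shown in the source above.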

The Benchmark Object

mteb.Benchmark dataclass

A benchmark object intended to run a certain benchmark within MTEB.

Parameters:

name (str, required): The name of the benchmark.
tasks (Sequence[AbsTask], required): The tasks within the benchmark.
description (str | None, default None): A description of the benchmark; it should state the benchmark's intended goal and, where relevant, how it was constructed.
reference (StrURL | None, default None): A link to a source with additional information, typically a paper, leaderboard or GitHub repository.
citation (str | None, default None): A BibTeX citation.
contacts (list[str] | None, default None): The people to contact in case of a problem with the benchmark, preferably GitHub handles.

Examples:

>>> Benchmark(
...     name="MTEB(custom)",
...     tasks=mteb.get_tasks(
...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
...         languages=["eng"],
...     ),
...     description="A custom benchmark"
... )
Source code in mteb/benchmarks/benchmark.py
@dataclass
class Benchmark:
    """A benchmark object intended to run a certain benchmark within MTEB.

    Args:
        name: The name of the benchmark
        tasks: The tasks within the benchmark.
        description: A description of the benchmark, should include its intended goal and potentially a description of its construction
        reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
        citation: A bibtex citation
        contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle.

    Examples:
        >>> Benchmark(
        ...     name="MTEB(custom)",
        ...     tasks=mteb.get_tasks(
        ...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
        ...         languages=["eng"],
        ...     ),
        ...     description="A custom benchmark"
        ... )
    """

    name: str
    tasks: Sequence[AbsTask]
    description: str | None = None
    reference: StrURL | None = None
    citation: str | None = None
    contacts: list[str] | None = None
    display_on_leaderboard: bool = True
    icon: str | None = None
    display_name: str | None = None

    def __iter__(self) -> Iterable[AbsTask]:
        return iter(self.tasks)

    def __len__(self) -> int:
        return len(self.tasks)

    def __getitem__(self, index: int) -> AbsTask:
        return self.tasks[index]

    def load_results(
        self, base_results: BenchmarkResults | None = None
    ) -> BenchmarkResults:
        if not hasattr(self, "results_cache"):
            self.results_cache = {}
        if base_results in self.results_cache:
            return self.results_cache[base_results]
        if base_results is None:
            base_results = load_results()
        results = base_results.select_tasks(self.tasks)
        self.results_cache[base_results] = results
        return results

    def _create_summary_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create summary table. Called by the leaderboard app."""
        return _create_summary_table_from_benchmark_results(benchmark_results)

    def _create_per_task_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create per-task table. Called by the leaderboard app."""
        return _create_per_task_table_from_benchmark_results(benchmark_results)
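
Because Benchmark defines __iter__, __len__ and __getitem__, it can be treated as a sequence of tasks, and load_results narrows the full set of published results to the benchmark's tasks. A minimal sketch, assuming a benchmark fetched by name as above (the name is an assumption):

import mteb

benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # assumed example name

print(len(benchmark))      # number of tasks, via __len__
first_task = benchmark[0]  # a single AbsTask, via __getitem__
for task in benchmark:     # iterate over the tasks, via __iter__
    print(task.metadata.name)

# Restrict the full, downloaded result set to this benchmark's tasks.
results = benchmark.load_results()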