
Benchmark

A benchmark within mteb is essentially just a list of tasks along with some metadata about the benchmark.

Figure: An overview of the benchmark within mteb.

This metadata includes a short description of the benchmark's intended purpose, a reference, and a citation. If you use a benchmark from mteb, we recommend citing it along with mteb.
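
For example, you can load a benchmark and inspect its metadata before citing it. The sketch below assumes the benchmark name "MTEB(eng, v2)" is registered in your installed version of mteb; any name returned by mteb.get_benchmarks() works the same way.

```python
import mteb

# "MTEB(eng, v2)" is assumed to be a registered benchmark name;
# substitute any name returned by mteb.get_benchmarks().
benchmark = mteb.get_benchmark("MTEB(eng, v2)")

print(benchmark.description)  # short description of the benchmark's intent
print(benchmark.reference)    # link to a paper, leaderboard, or GitHub repository
print(benchmark.citation)     # BibTeX entry to cite alongside mteb
print(len(benchmark.tasks))   # number of tasks in the benchmark
```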

Utilities

mteb.get_benchmarks(names=None, display_on_leaderboard=None)

Get a list of benchmarks by name.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `names` | `list[str] \| None` | A list of benchmark names to retrieve. If None, all benchmarks are returned. | `None` |
| `display_on_leaderboard` | `bool \| None` | If specified, filters benchmarks by whether they are displayed on the leaderboard. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `list[Benchmark]` | A list of Benchmark instances. |
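
A minimal usage sketch: list every registered benchmark, or only those displayed on the leaderboard.

```python
import mteb

# All registered benchmarks.
all_benchmarks = mteb.get_benchmarks()
print(len(all_benchmarks))

# Only the benchmarks shown on the public leaderboard.
for benchmark in mteb.get_benchmarks(display_on_leaderboard=True):
    print(benchmark.name)
```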

Source code in mteb/benchmarks/get_benchmark.py
```python
def get_benchmarks(
    names: list[str] | None = None, display_on_leaderboard: bool | None = None
) -> list[Benchmark]:
    """Get a list of benchmarks by name.

    Args:
        names: A list of benchmark names to retrieve. If None, all benchmarks are returned.
        display_on_leaderboard: If specified, filters benchmarks by whether they are displayed on the leaderboard.

    Returns:
        A list of Benchmark instances.
    """
    benchmark_registry = _build_registry()

    if names is None:
        names = list(benchmark_registry.keys())
    benchmarks = [get_benchmark(name) for name in names]
    if display_on_leaderboard is not None:
        benchmarks = [
            b for b in benchmarks if b.display_on_leaderboard is display_on_leaderboard
        ]
    return benchmarks
```

mteb.get_benchmark(benchmark_name)

Get a benchmark by name.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `benchmark_name` | `str` | The name of the benchmark to retrieve. | required |

Returns:

| Type | Description |
| --- | --- |
| `Benchmark` | The Benchmark instance corresponding to the given name. |
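
A usage sketch: lookups resolve both canonical names and registered aliases, and an unknown name raises a KeyError that suggests the closest registered name (see the source below).

```python
import mteb

# "MTEB(eng, v2)" is assumed to be a registered benchmark name (or alias)
# in your installed version of mteb.
benchmark = mteb.get_benchmark("MTEB(eng, v2)")

# A misspelled name raises a KeyError with a "did you mean" suggestion.
try:
    mteb.get_benchmark("MTEB(egn, v2)")
except KeyError as error:
    print(error)
```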

Source code in mteb/benchmarks/get_benchmark.py
```python
def get_benchmark(
    benchmark_name: str,
) -> Benchmark:
    """Get a benchmark by name.

    Args:
        benchmark_name: The name of the benchmark to retrieve.

    Returns:
        The Benchmark instance corresponding to the given name.
    """
    benchmark_registry = _build_registry()
    aliases_registry = _build_aliases_registry()

    if benchmark_name in aliases_registry:
        return aliases_registry[benchmark_name]
    if benchmark_name not in benchmark_registry:
        close_matches = difflib.get_close_matches(
            benchmark_name, benchmark_registry.keys()
        )
        if close_matches:
            suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean: {close_matches[0]}?"
        else:
            suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found."
        raise KeyError(suggestion)
    return benchmark_registry[benchmark_name]
```

The Benchmark Object

mteb.Benchmark dataclass

A benchmark object intended to run a certain benchmark within MTEB.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `name` | `str` | The name of the benchmark. | required |
| `aliases` | `Sequence[str]` | Alternative names for the benchmark. | `tuple()` |
| `tasks` | `Sequence[AbsTask]` | The tasks within the benchmark. | required |
| `description` | `str \| None` | A description of the benchmark; it should include the benchmark's intended goal and, potentially, a description of its construction. | `None` |
| `reference` | `StrURL \| None` | A link to a source containing additional information, typically a paper, leaderboard, or GitHub repository. | `None` |
| `citation` | `str \| None` | A BibTeX citation. | `None` |
| `contacts` | `list[str] \| None` | The people to contact in case of a problem with the benchmark, preferably GitHub handles. | `None` |

Examples:

```python
>>> Benchmark(
...     name="MTEB(custom)",
...     tasks=mteb.get_tasks(
...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
...         languages=["eng"],
...     ),
...     description="A custom benchmark"
... )
```

Source code in mteb/benchmarks/benchmark.py
```python
@dataclass
class Benchmark:
    """A benchmark object intended to run a certain benchmark within MTEB.

    Args:
        name: The name of the benchmark
        aliases: Alternative names for the benchmark
        tasks: The tasks within the benchmark.
        description: A description of the benchmark, should include its intended goal and potentially a description of its construction
        reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
        citation: A bibtex citation
        contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle.

    Examples:
        >>> Benchmark(
        ...     name="MTEB(custom)",
        ...     tasks=mteb.get_tasks(
        ...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
        ...         languages=["eng"],
        ...     ),
        ...     description="A custom benchmark"
        ... )
    """

    name: str
    tasks: Sequence[AbsTask]
    aliases: Sequence[str] = field(default_factory=tuple)
    description: str | None = None
    reference: StrURL | None = None
    citation: str | None = None
    contacts: list[str] | None = None
    display_on_leaderboard: bool = True
    icon: str | None = None
    display_name: str | None = None
    language_view: list[str] | Literal["all"] = field(default_factory=list)

    def __iter__(self) -> Iterator[AbsTask]:
        return iter(self.tasks)

    def __len__(self) -> int:
        return len(self.tasks)

    def __getitem__(self, index: int) -> AbsTask:
        return self.tasks[index]

    def _create_summary_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create summary table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the summary results.
        """
        from mteb.benchmarks._create_table import (
            _create_summary_table_from_benchmark_results,
        )

        return _create_summary_table_from_benchmark_results(benchmark_results)

    def _create_per_task_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create per-task table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the per-task results.
        """
        from mteb.benchmarks._create_table import (
            _create_per_task_table_from_benchmark_results,
        )

        return _create_per_task_table_from_benchmark_results(benchmark_results)

    def _create_per_language_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create per-language table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the per-language results.
        """
        from mteb.benchmarks._create_table import (
            _create_per_language_table_from_benchmark_results,
        )

        if self.language_view == "all" or len(self.language_view) > 0:
            return _create_per_language_table_from_benchmark_results(
                benchmark_results, self.language_view
            )
        else:
            no_results_frame = pd.DataFrame(
                {
                    "No results": [
                        "The per-language table is not available for this benchmark."
                    ]
                }
            )
            return no_results_frame
```
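
Because Benchmark implements `__iter__`, `__len__`, and `__getitem__`, it behaves like a read-only sequence of its tasks. A short sketch, again assuming "MTEB(eng, v2)" is a registered benchmark name:

```python
import mteb

benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # assumed registered name

print(len(benchmark))      # number of tasks, via __len__
first_task = benchmark[0]  # access a task by index, via __getitem__

# Iterate over the tasks, via __iter__; each task is an AbsTask carrying its own metadata.
for task in benchmark:
    print(task.metadata.name)
```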