
Benchmark

A benchmark within mteb is essentially a list of tasks along with some metadata describing the benchmark.

Figure: An overview of a benchmark within mteb.

This metadata includes a short description of the benchmark's intent, a reference, and a citation. If you use a benchmark from mteb, we recommend citing it along with mteb.
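
As a minimal sketch, this metadata can be inspected directly on a retrieved benchmark (the benchmark name below is only an example and has to be registered in your installed version of mteb):

>>> import mteb
>>> benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # example name; any registered benchmark works
>>> benchmark.description  # short description of the benchmark's intent
>>> benchmark.reference    # link to a paper, leaderboard or GitHub repository
>>> benchmark.citation     # BibTeX entry to cite alongside mteb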

Utilities

mteb.get_benchmarks(names=None, display_on_leaderboard=None)

Get a list of benchmarks by name.

Parameters:

    names (list[str] | None): A list of benchmark names to retrieve. If None, all benchmarks are returned. Default: None.
    display_on_leaderboard (bool | None): If specified, filters benchmarks by whether they are displayed on the leaderboard. Default: None.

Returns:

    list[Benchmark]: A list of Benchmark instances.

Source code in mteb/benchmarks/get_benchmark.py
def get_benchmarks(
    names: list[str] | None = None, display_on_leaderboard: bool | None = None
) -> list[Benchmark]:
    """Get a list of benchmarks by name.

    Args:
        names: A list of benchmark names to retrieve. If None, all benchmarks are returned.
        display_on_leaderboard: If specified, filters benchmarks by whether they are displayed on the leaderboard.

    Returns:
        A list of Benchmark instances.
    """
    benchmark_registry = _build_registry()

    if names is None:
        names = list(benchmark_registry.keys())
    benchmarks = [get_benchmark(name) for name in names]
    if display_on_leaderboard is not None:
        benchmarks = [
            b for b in benchmarks if b.display_on_leaderboard is display_on_leaderboard
        ]
    return benchmarks
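
A short usage sketch (the counts depend on the installed version of mteb):

>>> import mteb
>>> all_benchmarks = mteb.get_benchmarks()  # every registered benchmark
>>> leaderboard_benchmarks = mteb.get_benchmarks(display_on_leaderboard=True)
>>> len(leaderboard_benchmarks) <= len(all_benchmarks)
True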

mteb.get_benchmark(benchmark_name)

Get a benchmark by name.

Parameters:

    benchmark_name (str): The name of the benchmark to retrieve. Required.

Returns:

    Benchmark: The Benchmark instance corresponding to the given name.

Source code in mteb/benchmarks/get_benchmark.py
def get_benchmark(
    benchmark_name: str,
) -> Benchmark:
    """Get a benchmark by name.

    Args:
        benchmark_name: The name of the benchmark to retrieve.

    Returns:
        The Benchmark instance corresponding to the given name.
    """
    benchmark_registry = _build_registry()
    aliases_registry = _build_aliases_registry()

    if benchmark_name in aliases_registry:
        return aliases_registry[benchmark_name]
    if benchmark_name not in benchmark_registry:
        close_matches = difflib.get_close_matches(
            benchmark_name, benchmark_registry.keys()
        )
        if close_matches:
            suggestion = f"KeyError: '{benchmark_name}' not found. Did you mean: {close_matches[0]}?"
        else:
            suggestion = f"KeyError: '{benchmark_name}' not found and no similar keys were found."
        raise KeyError(suggestion)
    return benchmark_registry[benchmark_name]
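
A minimal usage sketch; an unknown name raises a KeyError that suggests the closest registered name (the misspelled name below is deliberate and purely illustrative):

>>> import mteb
>>> benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # example name; has to be registered
>>> try:
...     mteb.get_benchmark("MTEB(egn, v2)")  # deliberate typo
... except KeyError as error:
...     print(error)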

The Benchmark Object

mteb.Benchmark dataclass

A benchmark object intended to run a certain benchmark within MTEB.

Parameters:

    name (str): The name of the benchmark. Required.
    aliases (Sequence[str]): Alternative names for the benchmark. Default: tuple().
    tasks (Sequence[AbsTask]): The tasks within the benchmark. Required.
    description (str | None): A description of the benchmark; it should state the benchmark's intended goal and may describe how it was constructed. Default: None.
    reference (StrURL | None): A link to a source with additional information, typically a paper, leaderboard, or GitHub repository. Default: None.
    citation (str | None): A BibTeX citation. Default: None.
    contacts (list[str] | None): The people to contact in case of a problem with the benchmark, preferably GitHub handles. Default: None.

Examples:

>>> Benchmark(
...     name="MTEB(custom)",
...     tasks=mteb.get_tasks(
...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
...         languages=["eng"],
...     ),
...     description="A custom benchmark"
... )
Source code in mteb/benchmarks/benchmark.py
@dataclass
class Benchmark:
    """A benchmark object intended to run a certain benchmark within MTEB.

    Args:
        name: The name of the benchmark
        aliases: Alternative names for the benchmark
        tasks: The tasks within the benchmark.
        description: A description of the benchmark, should include its intended goal and potentially a description of its construction
        reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
        citation: A bibtex citation
        contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle.

    Examples:
        >>> Benchmark(
        ...     name="MTEB(custom)",
        ...     tasks=mteb.get_tasks(
        ...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
        ...         languages=["eng"],
        ...     ),
        ...     description="A custom benchmark"
        ... )
    """

    name: str
    tasks: Sequence[AbsTask]
    aliases: Sequence[str] = field(default_factory=tuple)
    description: str | None = None
    reference: StrURL | None = None
    citation: str | None = None
    contacts: list[str] | None = None
    display_on_leaderboard: bool = True
    icon: str | None = None
    display_name: str | None = None
    language_view: list[str] | Literal["all"] = field(default_factory=list)

    def __iter__(self) -> Iterator[AbsTask]:
        return iter(self.tasks)

    def __len__(self) -> int:
        return len(self.tasks)

    def __getitem__(self, index: int) -> AbsTask:
        return self.tasks[index]

    def _create_summary_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create summary table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the summary results.
        """
        from mteb.benchmarks._create_table import (
            _create_summary_table_from_benchmark_results,
        )

        return _create_summary_table_from_benchmark_results(benchmark_results)

    def _create_per_task_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create per-task table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the per-task results.
        """
        from mteb.benchmarks._create_table import (
            _create_per_task_table_from_benchmark_results,
        )

        return _create_per_task_table_from_benchmark_results(benchmark_results)

    def _create_per_language_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create per-language table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the per-language results.
        """
        from mteb.benchmarks._create_table import (
            _create_per_language_table_from_benchmark_results,
        )

        if self.language_view == "all" or len(self.language_view) > 0:
            return _create_per_language_table_from_benchmark_results(
                benchmark_results, self.language_view
            )
        else:
            no_results_frame = pd.DataFrame(
                {
                    "No results": [
                        "The per-language table is not available for this benchmark."
                    ]
                }
            )
            return no_results_frame

    def push_collection_to_hub(
        self,
        hf_username: str,
        collection_name: str | None = None,
    ) -> None:
        """Push the benchmark collection to Hugging Face Hub.

        Args:
            hf_username: Hugging Face username or organization name
            collection_name: Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used.
        """
        collections = huggingface_hub.list_collections(owner=hf_username)
        collection_name = collection_name or self.name
        existing_collection = None
        for collection in collections:
            if collection.title == collection_name:
                existing_collection = collection
                break

        if existing_collection is None:
            description = self.description
            if description and len(description) > 150:
                description = description[:147] + "..."
            collection = huggingface_hub.create_collection(
                title=collection_name,
                namespace=hf_username,
                # hf collections have a 150 character limit for description, so we truncate it if it's too long
                description=description if description else None,
            )
        else:
            # list collections would output only 4 items
            collection = huggingface_hub.get_collection(
                collection_slug=existing_collection.slug
            )

        existing_items = {item.item_id for item in collection.items}

        for task in self.tasks:
            tasks = (
                cast("AbsTaskAggregate", task).tasks if task.is_aggregate else [task]
            )
            for benchmark_task in tasks:
                task_path = benchmark_task.metadata.dataset["path"]
                if task_path in existing_items:
                    continue
                huggingface_hub.add_collection_item(
                    collection_slug=collection.slug,
                    item_id=task_path,
                    item_type="dataset",
                )
                existing_items.add(task_path)
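
Because Benchmark implements __iter__, __len__ and __getitem__, it behaves like a sequence of its tasks. A small sketch, reusing the tasks from the example above:

>>> import mteb
>>> bench = mteb.Benchmark(
...     name="MTEB(custom)",
...     tasks=mteb.get_tasks(
...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
...         languages=["eng"],
...     ),
... )
>>> len(bench)
2
>>> bench[0].metadata.name  # tasks are accessible by index and by iteration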

push_collection_to_hub(hf_username, collection_name=None)

Push the benchmark collection to Hugging Face Hub.

Parameters:

    hf_username (str): Hugging Face username or organization name. Required.
    collection_name (str | None): Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used. Default: None.
Source code in mteb/benchmarks/benchmark.py
def push_collection_to_hub(
    self,
    hf_username: str,
    collection_name: str | None = None,
) -> None:
    """Push the benchmark collection to Hugging Face Hub.

    Args:
        hf_username: Hugging Face username or organization name
        collection_name: Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used.
    """
    collections = huggingface_hub.list_collections(owner=hf_username)
    collection_name = collection_name or self.name
    existing_collection = None
    for collection in collections:
        if collection.title == collection_name:
            existing_collection = collection
            break

    if existing_collection is None:
        description = self.description
        if description and len(description) > 150:
            description = description[:147] + "..."
        collection = huggingface_hub.create_collection(
            title=collection_name,
            namespace=hf_username,
            # hf collections have a 150 character limit for description, so we truncate it if it's too long
            description=description if description else None,
        )
    else:
        # list collections would output only 4 items
        collection = huggingface_hub.get_collection(
            collection_slug=existing_collection.slug
        )

    existing_items = {item.item_id for item in collection.items}

    for task in self.tasks:
        tasks = (
            cast("AbsTaskAggregate", task).tasks if task.is_aggregate else [task]
        )
        for benchmark_task in tasks:
            task_path = benchmark_task.metadata.dataset["path"]
            if task_path in existing_items:
                continue
            huggingface_hub.add_collection_item(
                collection_slug=collection.slug,
                item_id=task_path,
                item_type="dataset",
            )
            existing_items.add(task_path)
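
A usage sketch (the username below is a placeholder; you must be authenticated with huggingface_hub, e.g. via the huggingface-cli login command, and have write access to the target namespace):

>>> import mteb
>>> benchmark = mteb.get_benchmark("MTEB(eng, v2)")  # example name; any registered benchmark works
>>> benchmark.push_collection_to_hub("my-username")  # creates or updates a collection containing the benchmark's datasets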