Skip to content

Benchmark

A benchmark within mteb is essentially just a list of tasks along with some metadata about the benchmark.

An overview of the benchmark within mteb

This metadata includes a short description of the benchmark's intention, the reference, and the citation. If you use a benchmark from mteb, we recommend that you cite it along with mteb.

Utilities

mteb.get_benchmarks(names=None, display_on_leaderboard=None)

Get a list of benchmarks by name.

Parameters:

Name Type Description Default
names list[str] | None

A list of benchmark names to retrieve. If None, all benchmarks are returned.

None
display_on_leaderboard bool | None

If specified, filters benchmarks by whether they are displayed on the leaderboard.

None

Returns:

Type Description
list[Benchmark]

A list of Benchmark instances.

Source code in mteb/benchmarks/get_benchmark.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def get_benchmarks(
    names: list[str] | None = None, display_on_leaderboard: bool | None = None
) -> list[Benchmark]:
    """Look up several benchmarks at once, optionally filtered by leaderboard visibility.

    Args:
        names: Names of the benchmarks to fetch. When None, every benchmark in
            the registry is returned.
        display_on_leaderboard: When not None, keep only the benchmarks whose
            `display_on_leaderboard` flag is this exact value.

    Returns:
        The matching Benchmark instances, in the order the names were resolved.
    """
    registry = _build_registry()

    # Fall back to every registered name when the caller did not narrow the selection.
    requested_names = list(registry.keys()) if names is None else names
    selected = [get_benchmark(benchmark_name) for benchmark_name in requested_names]

    if display_on_leaderboard is None:
        return selected

    return [
        benchmark
        for benchmark in selected
        if benchmark.display_on_leaderboard is display_on_leaderboard
    ]

mteb.get_benchmark(benchmark_name)

Get a benchmark by name.

Parameters:

Name Type Description Default
benchmark_name str

The name of the benchmark to retrieve.

required

Returns:

Type Description
Benchmark

The Benchmark instance corresponding to the given name.

Source code in mteb/benchmarks/get_benchmark.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def get_benchmark(
    benchmark_name: str,
) -> Benchmark:
    """Resolve a single benchmark from its name or one of its aliases.

    Args:
        benchmark_name: Name (or alias) of the benchmark to look up.

    Returns:
        The Benchmark registered under the given name or alias.

    Raises:
        KeyError: If the name is neither an alias nor a registered benchmark.
            The message suggests the closest registered name when one exists.
    """
    registry = _build_registry()
    aliases = _build_aliases_registry()

    # Aliases are consulted first, so they win over a registry entry of the same name.
    if benchmark_name in aliases:
        return aliases[benchmark_name]
    if benchmark_name in registry:
        return registry[benchmark_name]

    # Unknown name: build a helpful error, suggesting the nearest registry key if any.
    candidates = difflib.get_close_matches(benchmark_name, registry.keys())
    if not candidates:
        raise KeyError(
            f"'{benchmark_name}' not found and no similar keys were found."
        )
    raise KeyError(f"'{benchmark_name}' not found. Did you mean: {candidates[0]}?")

The Benchmark Object

mteb.Benchmark dataclass

A benchmark object intended to run a certain benchmark within MTEB.

Parameters:

Name Type Description Default
name str

The name of the benchmark

required
aliases Sequence[str]

Alternative names for the benchmark

tuple()
tasks Sequence[AbsTask]

The tasks within the benchmark.

required
description str | None

A description of the benchmark, should include its intended goal and potentially a description of its construction

None
reference StrURL | None

A reference link to a source with additional information, typically a paper, leaderboard, or GitHub repository.

None
citation str | None

A bibtex citation

None
contacts list[str] | None

The people to contact in case of a problem in the benchmark, preferably a GitHub handle.

None

Examples:

>>> Benchmark(
...     name="MTEB(custom)",
...     tasks=mteb.get_tasks(
...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
...         languages=["eng"],
...     ),
...     description="A custom benchmark"
... )
Source code in mteb/benchmarks/benchmark.py
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
@dataclass
class Benchmark:
    """A benchmark object intended to run a certain benchmark within MTEB.

    Args:
        name: The name of the benchmark
        aliases: Alternative names for the benchmark
        tasks: The tasks within the benchmark.
        description: A description of the benchmark, should include its intended goal and potentially a description of its construction
        reference: A link reference, to a source containing additional information typically to a paper, leaderboard or github.
        citation: A bibtex citation
        contacts: The people to contact in case of a problem in the benchmark, preferably a GitHub handle.
        icon: Optional icon for the benchmark. Not read by this class (presumably consumed by the leaderboard; TODO confirm).
        display_name: Optional alternative name for display. Not read by this class (TODO confirm where it is used).
        language_view: Languages forwarded to the per-language results table, or the literal "all".
            An empty list (the default) disables the per-language table.
        benchmark_hf_repo: Hugging Face Hub dataset repo that the benchmark card and `eval.yaml` are pushed to.

    Examples:
        >>> Benchmark(
        ...     name="MTEB(custom)",
        ...     tasks=mteb.get_tasks(
        ...         tasks=["AmazonCounterfactualClassification", "AmazonPolarityClassification"],
        ...         languages=["eng"],
        ...     ),
        ...     description="A custom benchmark"
        ... )
    """

    name: str
    tasks: Sequence[AbsTask]
    aliases: Sequence[str] = field(default_factory=tuple)
    description: str | None = None
    reference: StrURL | None = None
    citation: str | None = None
    contacts: list[str] | None = None
    icon: str | None = None
    display_name: str | None = None
    language_view: list[str] | Literal["all"] = field(default_factory=list)
    benchmark_hf_repo: str | None = None

    @property
    def display_on_leaderboard(self) -> bool:
        """Whether the benchmark should be displayed on the leaderboard."""
        benchmarks_on_leaderboard = _get_benchmarks_on_leaderboard()
        return self.name in benchmarks_on_leaderboard

    def __iter__(self) -> Iterator[AbsTask]:
        """Iterate over the tasks of the benchmark."""
        return iter(self.tasks)

    def __len__(self) -> int:
        """Return the number of tasks in the benchmark."""
        return len(self.tasks)

    def __getitem__(self, index: int) -> AbsTask:
        """Return the task at position *index*."""
        return self.tasks[index]

    def _create_summary_table(  # noqa: PLR6301
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create summary table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the summary results.
        """
        from mteb.benchmarks._create_table import (
            _create_summary_table_from_benchmark_results,
        )

        return _create_summary_table_from_benchmark_results(benchmark_results)

    def _create_per_task_table(  # noqa: PLR6301
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create per-task table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the per-task results.
        """
        from mteb.benchmarks._create_table import (
            _create_per_task_table_from_benchmark_results,
        )

        return _create_per_task_table_from_benchmark_results(benchmark_results)

    def _create_per_language_table(
        self, benchmark_results: BenchmarkResults
    ) -> pd.DataFrame:
        """Create per-language table. Called by the leaderboard app.

        Returns:
            A pandas DataFrame representing the per-language results, or a
            one-cell placeholder frame when `language_view` is empty.
        """
        from mteb.benchmarks._create_table import (
            _create_per_language_table_from_benchmark_results,
        )

        if self.language_view == "all" or len(self.language_view) > 0:
            return _create_per_language_table_from_benchmark_results(
                benchmark_results, self.language_view
            )
        else:
            # No languages configured: return a frame explaining why the table is empty.
            no_results_frame = pd.DataFrame(
                {
                    "No results": [
                        "The per-language table is not available for this benchmark."
                    ]
                }
            )
            return no_results_frame

    def push_collection_to_hub(
        self,
        hf_username: str,
        collection_name: str | None = None,
    ) -> None:
        """Push the benchmark collection to Hugging Face Hub.

        An existing collection with a matching title is reused; otherwise a new
        one is created. Dataset paths already present in the collection are skipped.

        Args:
            hf_username: Hugging Face username or organization name
            collection_name: Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used.
        """
        collections = huggingface_hub.list_collections(owner=hf_username)
        collection_name = collection_name or self.name
        existing_collection = None
        for collection in collections:
            if collection.title == collection_name:
                existing_collection = collection
                break

        if existing_collection is None:
            # hf collections have a 150 character limit for description, so we truncate it if it's too long
            description = self.description
            if description and len(description) > 150:
                description = description[:147] + "..."
            collection = huggingface_hub.create_collection(
                title=collection_name,
                namespace=hf_username,
                description=description if description else None,
            )
        else:
            # list collections would output only 4 items
            collection = huggingface_hub.get_collection(
                collection_slug=existing_collection.slug
            )

        existing_items = {item.item_id for item in collection.items}

        for task in self.tasks:
            # Aggregate tasks contribute the datasets of their sub-tasks.
            tasks = (
                cast("AbsTaskAggregate", task).tasks if task.is_aggregate else [task]
            )
            for benchmark_task in tasks:
                task_path = benchmark_task.metadata.dataset["path"]
                if task_path in existing_items:
                    continue
                huggingface_hub.add_collection_item(
                    collection_slug=collection.slug,
                    item_id=task_path,
                    item_type="dataset",
                )
                # Remember the path so a dataset shared by several tasks is added once.
                existing_items.add(task_path)

    def __repr__(self) -> str:
        """Return a compact representation with the description cut to 50 characters."""
        n_tasks = len(self.tasks)
        max_len = 50
        desc = self.description if self.description else ""
        # Both branches must close the quote opened in front of the description.
        desc = f"'{desc[:max_len]}...'" if len(desc) > max_len else f"'{desc}'"
        return f"{self.__class__.__name__}(name='{self.name}', description={desc}, tasks=[...] (#{n_tasks}), ...)"

    def _generate_benchmark_card(self) -> DatasetCard:
        """Generate a README/dataset card for this benchmark."""
        template_path = Path(__file__).parent / "benchmark_card_template.md"

        # One row per task, handed to the card template.
        task_rows = [
            {
                "name": task.metadata.name,
                "reference": task.metadata.reference,
                "simplified_type": task.metadata.simplified_task_type,
                "description": task.metadata.description or "",
            }
            for task in self.tasks
        ]

        return cast(
            "DatasetCard",
            DatasetCard.from_template(
                card_data=DatasetCardData(tags=["mteb", "benchmark"]),
                template_path=str(template_path),
                benchmark_name=self.name,
                benchmark_description=self.description,
                tasks=task_rows,
                citation=self.citation,
            ),
        )

    def push_benchmark_card_to_hub(
        self,
        *,
        create_pr: bool = False,
    ) -> None:
        """Push a README benchmark card to the HuggingFace Hub dataset repo.

        The dataset repo is created first when it does not exist yet.

        Args:
            create_pr: Forwarded to the card's `push_to_hub` call.

        Raises:
            ValueError: If `benchmark_hf_repo` is not set.
        """
        if self.benchmark_hf_repo is None:
            raise ValueError(
                "`benchmark_hf_repo` must be set to push a benchmark card to the hub."
            )

        if not huggingface_hub.repo_exists(self.benchmark_hf_repo, repo_type="dataset"):
            huggingface_hub.create_repo(
                self.benchmark_hf_repo,
                repo_type="dataset",
            )

        card = self._generate_benchmark_card()
        card.push_to_hub(
            self.benchmark_hf_repo,
            repo_type="dataset",
            commit_message="Add benchmark card",
            create_pr=create_pr,
        )

    def push_eval_to_hub(
        self,
        *,
        create_pr: bool = False,
    ) -> None:
        """Push `eval.yaml` to the HuggingFace Hub

        If the repo already contains an `eval.yaml`, the new config is merged
        with it before uploading.

        Args:
            create_pr: Whether to create the PR

        Raises:
            ValueError: If `benchmark_hf_repo` is not set.
        """
        eval_file_name = "eval.yaml"

        if self.benchmark_hf_repo is None:
            raise ValueError(
                "`benchmark_hf_repo` must be set to push eval config to the hub."
            )

        existing_eval_path = _get_file_on_hub(
            repo_id=self.benchmark_hf_repo,
            file_name=eval_file_name,
            repo_type="dataset",
        )

        # handle multiple tasks in one repo (e.g. BRIGHT)
        existing_eval = None
        if existing_eval_path is not None:
            with Path(existing_eval_path).open(encoding="utf-8") as f:
                existing_eval_dict = yaml.safe_load(f)
            # An empty YAML file loads as None; treat it as "no existing config".
            if existing_eval_dict is not None:
                existing_eval = HFEvalMeta.model_validate(existing_eval_dict)

        benchmark_config = self._to_hf_eval_config()
        benchmark_config = (
            benchmark_config.merge(existing_eval) if existing_eval else benchmark_config
        )

        # Upload the rendered YAML straight from memory. Writing it to a
        # NamedTemporaryFile and re-opening that file by name fails on Windows,
        # where an open temporary file cannot be opened a second time.
        huggingface_hub.upload_file(
            path_or_fileobj=benchmark_config.to_yaml().encode("utf-8"),
            path_in_repo=eval_file_name,
            repo_id=self.benchmark_hf_repo,
            repo_type="dataset",
            commit_message="Add eval config",
            create_pr=create_pr,
        )

    def _to_hf_eval_config(self) -> HFEvalMeta:
        """Build the eval config for this benchmark: a single task entry keyed by the benchmark name."""
        return HFEvalMeta(
            name=self.name,
            description=self.description,
            tasks=[
                HFEvalTaskConfig(
                    id=self.name,
                    config=None,
                    split=None,
                )
            ],
        )

    def _get_model_score(
        self,
        model_result: ModelResult,
    ) -> dict[str, float | None]:
        """Compute aggregated scores for a single model.

        Raises:
            ValueError: If the model has fewer task results than the benchmark has tasks.
        """
        filtered = model_result.select_tasks(self.tasks).task_results
        if len(filtered) < len(self.tasks):
            raise ValueError(
                "Some scores of benchmark are missing. Please, run model on full benchmark tasks"
            )
        return {
            "Mean(Task)": _compute_mean_task(filtered),
            "Mean(TaskType)": _compute_mean_task_type(filtered),
        }

    def get_score(
        self,
        results: BenchmarkResults,
        *,
        raise_error: bool = False,
    ) -> dict[str, dict[str, float | None]]:
        """Get aggregated scores for all models in *results*.

        The benchmark class controls how scores are aggregated — subclasses may
        override this method to customise the returned metrics.

        Args:
            results: A `BenchmarkResults` object containing the model
                results to score.
            raise_error: Whether to raise an error on missing results.

        Returns:
            A dict mapping each model name to a dict with the keys:

            - ``"Mean(Task)"``: mean score across all benchmark tasks.
            - ``"Mean(TaskType)"``: mean of per-task-type means.
            - ``"Rank"``: Borda count rank (1 = best). Each model earns
                ``n - rank`` points per task; points are summed and the model
                with the highest total is ranked 1. Matches the leaderboard.

        Raises:
            ValueError: If a model is missing task results and *raise_error* is True.
        """
        from mteb.benchmarks._create_table import _get_borda_rank

        bench_results = results.join_revisions()
        scores: dict[str, dict[str, float | None]] = {}
        per_task_rows: dict[str, dict[str, float | None]] = {}

        for model_result in bench_results:
            # Every model gets a row, so incomplete models still take part in ranking.
            per_task_rows[model_result.model_name] = {}
            filtered = model_result.select_tasks(self.tasks).task_results
            try:
                scores[model_result.model_name] = self._get_model_score(model_result)
            except ValueError:
                if raise_error:
                    raise
                logger.warning(
                    "Some task results are missing. Filling results with None"
                )
                # NOTE(review): the fallback is keyed by task name, not by the
                # aggregate keys ("Mean(Task)", "Mean(TaskType)") documented above;
                # confirm whether callers rely on this before changing it.
                scores[model_result.model_name] = {
                    t.metadata.name: None for t in self.tasks
                }
                continue

            per_task_rows[model_result.model_name] = {
                tr.task_name: tr.get_score() for tr in filtered
            }

        if per_task_rows:
            per_task_df = pd.DataFrame.from_dict(per_task_rows, orient="index").reindex(
                list(per_task_rows.keys())
            )
            if per_task_df.shape[1] > 0:
                borda_ranks = _get_borda_rank(per_task_df)
                for name, rank in borda_ranks.items():
                    scores[name]["Rank"] = int(rank)  # type: ignore[index]
            else:
                # No per-task columns at all: nothing to rank on.
                for model_scores in scores.values():
                    model_scores["Rank"] = None
        else:
            for model_scores in scores.values():
                model_scores["Rank"] = None

        return scores

display_on_leaderboard property

Whether the benchmark should be displayed on the leaderboard.

get_score(results, *, raise_error=False)

Get aggregated scores for all models in results.

The benchmark class controls how scores are aggregated — subclasses may override this method to customise the returned metrics.

Parameters:

Name Type Description Default
results BenchmarkResults

A BenchmarkResults object containing the model results to score.

required
raise_error bool

Whether to raise an error on missing results.

False

Returns:

Type Description
dict[str, dict[str, float | None]]

A dict mapping each model name to a dict with the keys:

dict[str, dict[str, float | None]]
  • "Mean(Task)": mean score across all benchmark tasks.
dict[str, dict[str, float | None]]
  • "Mean(TaskType)": mean of per-task-type means.
dict[str, dict[str, float | None]]
  • "Rank": Borda count rank (1 = best). Each model earns n - rank points per task; points are summed and the model with the highest total is ranked 1. Matches the leaderboard.
Source code in mteb/benchmarks/benchmark.py
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
def get_score(
    self,
    results: BenchmarkResults,
    *,
    raise_error: bool = False,
) -> dict[str, dict[str, float | None]]:
    """Aggregate benchmark scores for every model contained in *results*.

    How scores are aggregated is decided by the benchmark class, so a
    subclass can override this method to return different metrics.

    Args:
        results: The `BenchmarkResults` holding the model results to score.
        raise_error: Whether to raise an error on missing results instead
            of logging a warning and filling the entry with None values.

    Returns:
        A dict mapping each model name to a dict with the keys:

        - ``"Mean(Task)"``: mean score across all benchmark tasks.
        - ``"Mean(TaskType)"``: mean of per-task-type means.
        - ``"Rank"``: Borda count rank (1 = best). Each model earns
            ``n - rank`` points per task; points are summed and the model
            with the highest total is ranked 1. Matches the leaderboard.
    """
    from mteb.benchmarks._create_table import _get_borda_rank

    scores: dict[str, dict[str, float | None]] = {}
    per_task_rows: dict[str, dict[str, float | None]] = {}

    for model_result in results.join_revisions():
        # Register an empty row up front so incomplete models still get a row.
        per_task_rows[model_result.model_name] = {}
        task_results = model_result.select_tasks(self.tasks).task_results
        try:
            aggregated = self._get_model_score(model_result)
        except ValueError:
            if raise_error:
                raise
            logger.warning(
                "Some task results are missing. Filling results with None"
            )
            scores[model_result.model_name] = {
                task.metadata.name: None for task in self.tasks
            }
        else:
            scores[model_result.model_name] = aggregated
            per_task_rows[model_result.model_name] = {
                task_result.task_name: task_result.get_score()
                for task_result in task_results
            }

    # Build the model x task frame only when at least one model was seen.
    ranking_frame = None
    if per_task_rows:
        ranking_frame = pd.DataFrame.from_dict(per_task_rows, orient="index").reindex(
            list(per_task_rows)
        )

    if ranking_frame is not None and ranking_frame.shape[1] > 0:
        for model_name, rank in _get_borda_rank(ranking_frame).items():
            scores[model_name]["Rank"] = int(rank)  # type: ignore[index]
    else:
        # Nothing to rank on: every model gets an explicit None rank.
        for model_scores in scores.values():
            model_scores["Rank"] = None

    return scores

push_benchmark_card_to_hub(*, create_pr=False)

Push a README benchmark card to the HuggingFace Hub dataset repo.

Source code in mteb/benchmarks/benchmark.py
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def push_benchmark_card_to_hub(
    self,
    *,
    create_pr: bool = False,
) -> None:
    """Upload this benchmark's README card to its HuggingFace Hub dataset repo.

    The dataset repo is created first if it does not exist.

    Args:
        create_pr: Forwarded to the card's `push_to_hub` call.

    Raises:
        ValueError: If `benchmark_hf_repo` is not set.
    """
    repo_id = self.benchmark_hf_repo
    if repo_id is None:
        raise ValueError(
            "`benchmark_hf_repo` must be set to push a benchmark card to the hub."
        )

    repo_missing = not huggingface_hub.repo_exists(repo_id, repo_type="dataset")
    if repo_missing:
        huggingface_hub.create_repo(repo_id, repo_type="dataset")

    self._generate_benchmark_card().push_to_hub(
        repo_id,
        repo_type="dataset",
        commit_message="Add benchmark card",
        create_pr=create_pr,
    )

push_collection_to_hub(hf_username, collection_name=None)

Push the benchmark collection to Hugging Face Hub.

Parameters:

Name Type Description Default
hf_username str

Hugging Face username or organization name

required
collection_name str | None

Name for the collection on Hugging Face Hub. If not provided, the benchmark name will be used.

None
Source code in mteb/benchmarks/benchmark.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
def push_collection_to_hub(
    self,
    hf_username: str,
    collection_name: str | None = None,
) -> None:
    """Create or extend a Hugging Face Hub collection with this benchmark's datasets.

    A collection owned by *hf_username* whose title matches is reused;
    otherwise a new one is created. Dataset paths already in the collection
    are left alone.

    Args:
        hf_username: Hugging Face username or organization name
        collection_name: Title of the collection on the Hub. Falls back to
            the benchmark name when not provided.
    """
    owned_collections = huggingface_hub.list_collections(owner=hf_username)
    collection_name = collection_name or self.name

    # First collection with a matching title, or None when there is no match.
    matching_collection = next(
        (
            candidate
            for candidate in owned_collections
            if candidate.title == collection_name
        ),
        None,
    )

    if matching_collection is not None:
        # list collections would output only 4 items
        collection = huggingface_hub.get_collection(
            collection_slug=matching_collection.slug
        )
    else:
        # hf collections have a 150 character limit for description, so we truncate it if it's too long
        summary = self.description
        if summary and len(summary) > 150:
            summary = summary[:147] + "..."
        collection = huggingface_hub.create_collection(
            title=collection_name,
            namespace=hf_username,
            description=summary or None,
        )

    known_paths = {entry.item_id for entry in collection.items}

    for task in self.tasks:
        # Aggregate tasks contribute the datasets of their sub-tasks.
        if task.is_aggregate:
            member_tasks = cast("AbsTaskAggregate", task).tasks
        else:
            member_tasks = [task]

        for member in member_tasks:
            dataset_path = member.metadata.dataset["path"]
            if dataset_path not in known_paths:
                huggingface_hub.add_collection_item(
                    collection_slug=collection.slug,
                    item_id=dataset_path,
                    item_type="dataset",
                )
                known_paths.add(dataset_path)

push_eval_to_hub(*, create_pr=False)

Push eval.yaml to the HuggingFace Hub

Parameters:

Name Type Description Default
create_pr bool

Whether to create the PR

False
Source code in mteb/benchmarks/benchmark.py
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def push_eval_to_hub(
    self,
    *,
    create_pr: bool = False,
) -> None:
    """Push `eval.yaml` to the HuggingFace Hub

    If the repo already contains an `eval.yaml`, the new config is merged
    with it before uploading.

    Args:
        create_pr: Whether to create the PR

    Raises:
        ValueError: If `benchmark_hf_repo` is not set.
    """
    eval_file_name = "eval.yaml"

    if self.benchmark_hf_repo is None:
        raise ValueError(
            "`benchmark_hf_repo` must be set to push eval config to the hub."
        )

    existing_eval_path = _get_file_on_hub(
        repo_id=self.benchmark_hf_repo,
        file_name=eval_file_name,
        repo_type="dataset",
    )

    # handle multiple tasks in one repo (e.g. BRIGHT)
    existing_eval = None
    if existing_eval_path is not None:
        with Path(existing_eval_path).open(encoding="utf-8") as f:
            existing_eval_dict = yaml.safe_load(f)
        # An empty YAML file loads as None; treat it as "no existing config".
        if existing_eval_dict is not None:
            existing_eval = HFEvalMeta.model_validate(existing_eval_dict)

    benchmark_config = self._to_hf_eval_config()
    benchmark_config = (
        benchmark_config.merge(existing_eval) if existing_eval else benchmark_config
    )

    # Upload the rendered YAML straight from memory. Writing it to a
    # NamedTemporaryFile and re-opening that file by name fails on Windows,
    # where an open temporary file cannot be opened a second time.
    huggingface_hub.upload_file(
        path_or_fileobj=benchmark_config.to_yaml().encode("utf-8"),
        path_in_repo=eval_file_name,
        repo_id=self.benchmark_hf_repo,
        repo_type="dataset",
        commit_message="Add eval config",
        create_pr=create_pr,
    )