Results¶

When a model is evaluated in MTEB it produces results. These results consist of:

TaskResult: Result for a single task
ModelResult: Result for a model on a set of tasks
BenchmarkResults: Result for a set of models on a set of tasks

In normal use these come up when running a model:

# ...
models_results = mteb.evaluate(model, tasks)
type(models_results) # mteb.results.ModelResult

task_result = models_results.task_results[0]
type(task_result) # mteb.results.TaskResult

Results cache¶

`mteb.cache.ResultCache` ¶

Class to handle the local cache of MTEB results.

Examples:

>>> import mteb
>>> cache = mteb.ResultCache(cache_path="~/.cache/mteb")  # default
>>> cache.download_from_remote()  # download the latest results from the remote repository
>>> result = cache.load_results("task_name", "model_name")

Source code in mteb/cache/result_cache.py

class ResultCache:
    """Class to handle the local cache of MTEB results.

    Examples:
        >>> import mteb
        >>> cache = mteb.ResultCache(cache_path="~/.cache/mteb")  # default
        >>> cache.download_from_remote()  # download the latest results from the remote repository
        >>> result = cache.load_results("task_name", "model_name")
    """

    cache_path: Path

    def __init__(self, cache_path: Path | str | None = None) -> None:
        if cache_path is not None:
            self.cache_path = Path(cache_path)
        else:
            self.cache_path = self.default_cache_path
        self.cache_path.mkdir(parents=True, exist_ok=True)

    @property
    def remote_repo_path(self) -> Path:
        """Get the path to the remote repository clone.

        Returns:
            The path to the remote repository clone.
        """
        return self.cache_path / "remote"

    @property
    def has_remote(self) -> bool:
        """Check if the remote results repository exists in the cache directory.

        Returns:
            True if the remote results repository exists, False otherwise.
        """
        return self.remote_repo_path.exists()

    @property
    def remote_results_path(self) -> Path:
        """Get the path to the remote results directory.

        Returns:
            The path to the remote results directory.
        """
        return self.remote_repo_path / "results"

    def get_task_result_path(
        self,
        task_name: str,
        model_name: str | ModelMeta,
        model_revision: str | None = None,
        remote: bool = False,
        experiment_name: str | None = None,
    ) -> Path:
        """Get the path to the results of a specific task for a specific model and revision.

        The path is only computed; the file is not required to exist.

        Args:
            task_name: The name of the task.
            model_name: The name of the model or a ModelMeta object. For a string,
                "/" is replaced by "__" and " " by "_" to form the directory name.
            model_revision: The revision of the model. Ignored when model_name is a
                ModelMeta object. If None for a string model_name, a warning is emitted
                and the most recently modified revision folder is used, or the
                placeholder "no_revision_available" when the model has no revision folder.
            remote: If True, it will return the path to the remote results repository, otherwise it will return the path to the local results repository.
            experiment_name: The name of the experiment as a valid directory name. If model_name is a ModelMeta object, its experiment_name will be used.

        Returns:
            The path to the results of the task.
        """
        results_folder = (
            self.remote_results_path if remote else self.cache_path / "results"
        )

        if isinstance(model_name, ModelMeta):
            # The ModelMeta is authoritative. Warn whenever an explicit argument is
            # actually discarded: a given revision (as before) or an experiment
            # name that differs from the one carried by the ModelMeta.
            experiment_discarded = (
                experiment_name is not None
                and experiment_name != model_name.experiment_name
            )
            if model_revision is not None or experiment_discarded:
                logger.warning(
                    "model_revision and experiment_name is ignored when model_name is a ModelMeta object"
                )
            model_revision = model_name.revision
            experiment_name = model_name.experiment_name
            model_name = model_name.model_name_as_path()
        elif isinstance(model_name, str):
            model_name = model_name.replace("/", "__").replace(" ", "_")

        model_path = results_folder / model_name

        if model_revision is None:
            msg = "`model_revision` is not specified, attempting to load the latest revision. To disable this behavior, specify the 'model_revision` explicitly."
            logger.warning(msg)
            warnings.warn(msg)
            # Every sub-directory of the model folder is treated as a revision.
            revisions = [p for p in model_path.glob("*") if p.is_dir()]
            if not revisions:
                model_revision = "no_revision_available"
            else:
                if len(revisions) > 1:
                    logger.warning(
                        f"Multiple revisions found for model {model_name}: {revisions}. Using the latest one (according to latest edit)."
                    )
                    # Most recently modified folder first.
                    revisions.sort(key=lambda p: p.stat().st_mtime, reverse=True)
                model_revision = revisions[0].name

        revision_path = model_path / model_revision
        if experiment_name:
            # Experiment results live in a dedicated sub-folder of the revision.
            revision_path = revision_path / _EXPERIMENTS_FOLDER_NAME / experiment_name
        return revision_path / f"{task_name}.json"

    def load_task_result(
        self,
        task_name: str,
        model_name: str | ModelMeta,
        model_revision: str | None = None,
        raise_if_not_found: bool = False,
        prioritize_remote: bool = False,
        experiment_name: str | None = None,
    ) -> TaskResult | None:
        """Load a single task result from the local cache or the remote repository clone.

        The local result is used unless it is missing, or unless `prioritize_remote`
        is set and a remote result exists. The remote clone is only consulted when
        it is present in the cache directory.

        Args:
            task_name: The name of the task.
            model_name: The name of the model as a valid directory name or a ModelMeta object.
            model_revision: The revision of the model; forwarded to `get_task_result_path`.
            raise_if_not_found: If True, raise an error if the results are not found.
            prioritize_remote: If True, prefer the remote result over the local one when the remote result exists.
            experiment_name: Optional experiment folder name (a valid directory name).

        Returns:
            The results of the task, or None if not found.

        Raises:
            FileNotFoundError: If no result file exists and `raise_if_not_found` is True.
        """
        lookup = {
            "task_name": task_name,
            "model_name": model_name,
            "model_revision": model_revision,
            "experiment_name": experiment_name,
        }
        result_path = self.get_task_result_path(**lookup)

        if self.has_remote:
            remote_result_path = self.get_task_result_path(remote=True, **lookup)
            prefer_remote = prioritize_remote and remote_result_path.exists()
            # Fall back to the remote file whenever there is no local one.
            if prefer_remote or not result_path.exists():
                result_path = remote_result_path

        if result_path.exists():
            return TaskResult.from_disk(result_path)

        msg = f"Results for {model_name} on {task_name} not found in {result_path}"
        if raise_if_not_found:
            raise FileNotFoundError(msg)
        logger.debug(msg)
        return None

    def save_to_cache(
        self,
        task_result: TaskResult,
        model_name: str | ModelMeta,
        model_revision: str | None = None,
        *,
        encode_kwargs: Mapping[str, Any] | None = None,
    ) -> None:
        """Write a task result, plus accompanying metadata, into the local cache.

        The result file location comes from `get_task_result_path`. Next to it, a
        model_meta.json is written when `model_name` is a ModelMeta object, and one
        run-settings record per (split, score entry) is merged into run_settings.jsonl.

        Args:
            task_result: The results of the task.
            model_name: The name of the model as a valid directory name or a ModelMeta object.
            model_revision: The revision of the model; forwarded to `get_task_result_path`.
            encode_kwargs: The keyword arguments passed to the model's encode method during evaluation.
        """
        destination = self.get_task_result_path(
            task_name=task_result.task_name,
            model_name=model_name,
            model_revision=model_revision,
        )
        target_dir = destination.parent
        target_dir.mkdir(parents=True, exist_ok=True)
        task_result.to_disk(destination)

        if isinstance(model_name, ModelMeta):
            with (target_dir / "model_meta.json").open("w") as meta_file:
                json.dump(model_name.to_dict(), meta_file, default=str, indent=4)

        package_versions = _get_package_versions()

        def _plain_encode_kwargs() -> Any:
            # Round-trip through JSON (stringifying unknown objects) so the record
            # only holds JSON-compatible values; a fresh object per record.
            if encode_kwargs is None:
                return {}
            return json.loads(json.dumps(encode_kwargs, default=str))

        run_settings_list: list[dict[str, Any]] = [
            {
                "task": task_result.task_name,
                "split": split,
                "subset": score_entry.get("hf_subset", "default"),
                "version": package_versions,
                "encode_kwargs": _plain_encode_kwargs(),
            }
            for split, split_scores in task_result.scores.items()
            for score_entry in split_scores
        ]

        if not run_settings_list:
            return
        _write_and_merge_keyed_json(target_dir / "run_settings.jsonl", run_settings_list)

    @property
    def default_cache_path(self) -> Path:
        """Get the local cache directory for MTEB results.

        Returns:
            The path to the local cache directory.
        """
        default_cache_directory = Path.home() / ".cache" / "mteb"

        _cache_directory = os.environ.get("MTEB_CACHE", None)
        cache_directory = (
            Path(_cache_directory) if _cache_directory else default_cache_directory
        )
        return cache_directory

    def download_from_remote(
        self,
        remote: str = "https://github.com/embeddings-benchmark/results",
        download_latest: bool = True,
        revision: str | None = None,
    ) -> Path:
        """Clone the results repository into the cache, or refresh an existing clone. Requires git to be installed.

        When no clone exists, a shallow clone (``--depth 1``) is created in
        ``{cache_path}/remote``. When a clone exists, its origin URL must equal
        `remote`; updates are then fetched and `revision` is checked out if given.

        Args:
            remote: The URL of the results repository on GitHub.
            download_latest: If True, fetch updates for an already existing clone. If False (and no revision is given), an existing clone is left untouched.
            revision: If specified, it will checkout the given revision after cloning or fetching.

        Returns:
            The path to the local clone of the results repository.

        Raises:
            ValueError: If the cache path exists but is not a directory, or if the
                existing clone points at a different remote URL.
            subprocess.CalledProcessError: If a git fetch, checkout or clone command fails.
        """
        cache_root = self.cache_path
        # The cache root must be a directory (or not exist yet).
        if cache_root.exists() and not cache_root.is_dir():
            raise ValueError(
                f"Cache path '{self.cache_path}' exists but is not a directory. "
                "Please remove it or specify a different cache path."
            )
        cache_root.mkdir(parents=True, exist_ok=True)

        results_directory = self.remote_repo_path

        if not results_directory.exists():
            # First download: create a fresh shallow clone named "remote".
            logger.info(
                f"No results repository found in {results_directory}, cloning it from {remote}"
            )
            clone_cmd = ["git", "clone", "--depth", "1"]
            if revision:
                logger.info(f"Cloning repository at revision '{revision}'")
                clone_cmd += [f"--revision={revision}"]
            clone_cmd += [remote, "remote"]
            subprocess.run(clone_cmd, cwd=self.cache_path, check=True, text=True)
            return results_directory

        # A clone already exists: make sure it tracks the requested remote.
        origin_lookup = subprocess.run(
            ["git", "config", "--get", "remote.origin.url"],
            check=False,
            cwd=results_directory,
            capture_output=True,
            text=True,
        )
        remote_url = origin_lookup.stdout.strip()
        if remote_url != remote:
            msg = (
                f"remote repository '{remote}' does not match the one in {results_directory},  which is '{remote_url}'."
                + " Please remove the directory and try again."
            )
            raise ValueError(msg)

        if not (revision or download_latest):
            logger.debug(
                f"Results repository already exists in {results_directory}, skipping update, "
                f"set download_latest=True to update it"
            )
        else:
            logger.info(
                f"remote repository already exists in {results_directory}, fetching updates"
            )
            # NOTE(review): only `git fetch` is run here; without a `revision` the
            # checked-out working tree is not moved by this method - confirm intended.
            subprocess.run(
                ["git", "fetch", "--all", "--tags"],
                cwd=results_directory,
                check=True,
                text=True,
            )

        if revision:
            logger.info(f"Checking out revision '{revision}'")
            subprocess.run(
                ["git", "checkout", revision],
                cwd=results_directory,
                check=True,
                text=True,
            )
        return results_directory

    def _download_cached_results_from_branch(
        self,
        *,
        branch: str = "cached-data",
        filename: str = "__cached_results.json.gz",
        output_path: Path | None = None,
        remote: str = "https://github.com/embeddings-benchmark/results",
        timeout: int = 60,
        max_size_mb: int = 500,
    ) -> Path:
        """Download pre-computed cached results from a specific branch.

        This is significantly faster than download_from_remote() since it downloads
        only a compressed cache file instead of cloning the entire repository.

        The method performs the following steps:
        1. Downloads a gzipped JSON file from the specified branch
           (retrying through the Git LFS media URL if a pointer file is returned)
        2. Validates content type and file size
        3. Decompresses the gzip content
        4. Writes the decompressed JSON to disk

        Args:
            branch: Branch name to download from (default: "cached-data")
            filename: Name of the cached results file (default: "__cached_results.json.gz")
            output_path: Where to save the file. If None, uses {cache_path}/leaderboard/__cached_results.json
            remote: Base URL of the results repository. A trailing "/" or ".git" is tolerated.
            timeout: Request timeout in seconds (default: 60)
            max_size_mb: Maximum allowed file size in megabytes (default: 500)

        Returns:
            Path to the downloaded and decompressed cache file

        Raises:
            requests.exceptions.RequestException: On HTTP errors
            ValueError: On validation failures (size, content-type)
            gzip.BadGzipFile: If content is not valid gzip
            UnicodeDecodeError: If content cannot be decoded as UTF-8
            PermissionError: If file cannot be written due to permissions
            OSError: On other file system errors

        Examples:
            >>> import mteb
            >>> cache = mteb.ResultCache()
            >>> # Download optimized cached results
            >>> cache_file = cache._download_cached_results_from_branch()
            >>> # Use custom output path
            >>> cache_file = cache._download_cached_results_from_branch(
            ...     output_path=Path("/tmp/my_cache.json")
            ... )
        """
        if output_path is None:
            # Default to saving in {cache_path}/leaderboard/__cached_results.json
            output_path = self.cache_path / "leaderboard" / "__cached_results.json"

        # Extract repository owner and name from the remote URL
        # e.g., "https://github.com/embeddings-benchmark/results" -> "embeddings-benchmark/results"
        repo_path = remote.replace("https://github.com/", "").replace(
            "http://github.com/", ""
        )
        # Clone-style URLs ("<repo>.git", trailing slash) would otherwise yield a
        # raw-content URL that cannot resolve.
        repo_path = repo_path.rstrip("/")
        if repo_path.endswith(".git"):
            repo_path = repo_path[: -len(".git")]

        url = f"https://raw.githubusercontent.com/{repo_path}/{branch}/{filename}"
        logger.info(f"Downloading cached results from {url}")

        # Step 1: Download with validation
        max_size_bytes = max_size_mb * 1024 * 1024

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()

            # Check if this is a Git LFS pointer file
            content_type = response.headers.get("content-type", "").lower()
            if (
                content_type == "text/plain; charset=utf-8"
                and b"git-lfs" in response.content
            ):
                # Try Git LFS media URL instead
                media_url = f"https://media.githubusercontent.com/media/{repo_path}/{branch}/{filename}"
                logger.info(f"Detected Git LFS file, trying media URL: {media_url}")
                response = requests.get(media_url, timeout=timeout)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "").lower()

            # Validate content-type header (a missing header is accepted)
            expected_content_types = [
                "application/gzip",
                "application/octet-stream",
                "application/x-gzip",
            ]
            if content_type and not any(
                ct in content_type for ct in expected_content_types
            ):
                # ValueError (not a bare Exception) so the documented contract holds;
                # callers catching Exception are unaffected.
                raise ValueError(
                    f"Unexpected content-type: {content_type}. Expected one of: {expected_content_types}"
                )

            # Validate file size (checked after the body has been received)
            content = response.content
            content_length = len(content)
            if content_length > max_size_bytes:
                raise ValueError(
                    f"Downloaded file too large: {content_length} bytes (max: {max_size_bytes})"
                )

            logger.info(
                f"HTTP request successful, content length: {content_length} bytes"
            )

        except Exception as e:
            # Log for diagnostics, then propagate the original exception unchanged.
            logger.error(f"Unexpected HTTP error: {type(e).__name__}: {e}")
            raise

        # Step 2: Decompress gzip data
        logger.info("Attempting gzip decompression...")

        try:
            with gzip.open(io.BytesIO(content), "rt", encoding="utf-8") as gz_file:
                data = gz_file.read()
            logger.info(f"Decompression successful, data length: {len(data)} chars")

        except Exception as e:
            logger.error(f"Unexpected decompression error: {type(e).__name__}: {e}")
            raise

        # Step 3: Write to disk
        logger.info(f"Attempting to write to: {output_path}")

        # Make sure the parent directory exists before writing
        output_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            output_path.write_text(data, encoding="utf-8")
            logger.info(
                f"File write successful, size: {output_path.stat().st_size} bytes"
            )
        except Exception as e:
            logger.error(f"Unexpected file write error: {type(e).__name__}: {e}")
            raise

        return output_path

    def clear_cache(self) -> None:
        """Delete the whole cache directory tree.

        If the cache path does not exist or is not a directory, nothing is removed
        and a warning is both logged and issued via `warnings.warn`.
        """
        target = self.cache_path
        if not (target.exists() and target.is_dir()):
            msg = f"Cache directory `{self.cache_path}` does not exist."
            logger.warning(msg)
            warnings.warn(msg)
            return

        shutil.rmtree(target)
        logger.info(f"Cache directory {self.cache_path} cleared.")

    def _load_from_cache(
        self,
        cache_filename: str = "__cached_results.json",
        rebuild: bool = False,
    ) -> BenchmarkResults:
        """Load benchmark results, trying the cheapest source first.

        Args:
            cache_filename: Name of the cache file. The full path will be constructed as
                {cache_path}/leaderboard/{cache_filename}.
            rebuild: If True, force a full rebuild from the results repository, bypassing any
                     pre-computed JSON cache.

        Strategy:
            1. If rebuild=False and the local cache file exists, load and return it.
            2. If rebuild=False and step 1 was unavailable or failed, download the
               pre-computed cache from the 'cached-data' branch, save it to the cache
               file and return it.
            3. Otherwise (or if rebuild=True) rebuild from the full results
               repository via `_rebuild_from_full_repository`.

        Returns:
            BenchmarkResults ready for leaderboard display
        """
        leaderboard_dir = self.cache_path / "leaderboard"
        cache_path = leaderboard_dir / cache_filename

        if rebuild:
            logger.info(
                "Rebuild requested, forcing full repository clone and rebuild..."
            )
        else:
            # Strategy 1: an already present local quick cache
            if cache_path.exists():
                logger.info(f"Loading existing quick cache from {cache_path}")
                try:
                    return BenchmarkResults.from_disk(cache_path)
                except Exception as e:
                    # Best effort: an unreadable cache must not block the other strategies.
                    logger.warning(
                        f"Failed to load quick cache: {e}. Trying other strategies..."
                    )

            # Strategy 2: pre-computed cache from the cached-data branch
            try:
                logger.info(
                    "Attempting to download pre-computed cache from cached-data branch..."
                )
                downloaded_path = self._download_cached_results_from_branch(
                    output_path=cache_path
                )
                logger.info(f"Downloaded cache to {downloaded_path}")
                return BenchmarkResults.from_disk(downloaded_path)
            except Exception as e:
                logger.warning(f"Failed to download from cached-data branch: {e}")

            logger.info("Falling back to full repository clone and rebuild...")

        # Strategy 3 (and the rebuild=True path): full repository rebuild
        return self._rebuild_from_full_repository(cache_path)

    def _rebuild_from_full_repository(self, quick_cache_path: Path) -> BenchmarkResults:
        """Build BenchmarkResults from the full results repository and persist them.

        Steps:
        1. Download or update the full results repository (`download_from_remote`)
        2. Load results for every model name reported by `get_model_metas()`
        3. Save the aggregated results to `quick_cache_path`
        4. Return the aggregated results

        Args:
            quick_cache_path: Path where the rebuilt cache should be saved

        Returns:
            BenchmarkResults built from the full repository
        """
        self.download_from_remote()

        # Model metas without a name cannot be looked up, so they are left out.
        all_model_names = []
        for model_meta in get_model_metas():
            if model_meta.name is None:
                continue
            all_model_names.append(model_meta.name)

        all_results = self.load_results(
            models=all_model_names,
            only_main_score=True,
            require_model_meta=False,
            include_remote=True,
        )

        # Persist so that later loads can use the quick cache.
        logger.info(f"Saving rebuilt cache to {quick_cache_path}")
        all_results.to_disk(quick_cache_path)
        return all_results

    def __repr__(self) -> str:
        return f"ResultCache(cache_path={self.cache_path})"

    def get_cache_paths(
        self,
        models: Sequence[str] | Iterable[ModelMeta] | None = None,
        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
        require_model_meta: bool = True,
        include_remote: bool = True,
        load_experiments: LoadExperimentEnum | str = LoadExperimentEnum.NO_EXPERIMENTS,
    ) -> list[Path]:
        """Collect the paths of result JSON files stored in the cache directory.

        These paths can then be used to fetch task results, like:
        ```python
        for path in paths:
            task_result = TaskResult.from_disk(path)
        ```

        Args:
            models: A list of model names or ModelMeta objects to filter the paths.
            tasks: A list of task names to filter the paths.
            require_model_meta: If True, only return paths that have a model_meta.json file next to them.
            include_remote: If True, include remote results in the returned paths.
            load_experiments: Experiment loading mode (enum member or its string form). Experiment results are included for any value other than NO_EXPERIMENTS.

        Returns:
            A list of paths in the cache directory; local results come before remote ones.

        Examples:
            >>> import mteb
            >>> cache = mteb.ResultCache()
            >>>
            >>> # Get all cache paths
            >>> paths = cache.get_cache_paths()
            >>>
            >>> # Get all cache paths for a specific task
            >>> paths = cache.get_cache_paths(tasks=["STS12"])
            >>>
            >>> # Get all cache paths for a specific model
            >>> paths = cache.get_cache_paths(models=["sentence-transformers/all-MiniLM-L6-v2"])
            >>>
            >>> # Get all cache paths for a specific model and revision
            >>> model_meta = mteb.get_model_meta("sentence-transformers/all-MiniLM-L6-v2")
            >>> paths = cache.get_cache_paths(models=[model_meta])
        """
        if isinstance(load_experiments, str):
            load_experiments = LoadExperimentEnum.from_str(load_experiments)

        def _result_files(base_path: Path, pattern: str) -> list[Path]:
            # model_meta.json shares the folder with results but is not a result.
            return [
                candidate
                for candidate in base_path.glob(pattern)
                if candidate.name != "model_meta.json"
            ]

        patterns = ["*/*/*.json"]  # model/revision/task.json
        if load_experiments != LoadExperimentEnum.NO_EXPERIMENTS:
            patterns.append(f"*/*/{_EXPERIMENTS_FOLDER_NAME}/*/*.json")

        search_roots = [self.cache_path / "results"]
        if include_remote:
            search_roots.append(self.remote_results_path)

        cache_paths: list[Path] = []
        for root in search_roots:
            for pattern in patterns:
                cache_paths.extend(_result_files(root, pattern))

        cache_paths = self._filter_paths_by_model_and_revision(
            cache_paths,
            models=models,
            load_experiments=load_experiments,
        )
        cache_paths = self._filter_paths_by_task(cache_paths, tasks=tasks)

        if not require_model_meta:
            return cache_paths
        return [
            path for path in cache_paths if (path.parent / "model_meta.json").exists()
        ]

    def get_models(
        self,
        tasks: Sequence[str] | None = None,
        require_model_meta: bool = True,
        include_remote: bool = True,
    ) -> list[tuple[ModelName, Revision]]:
        """Get all models in the cache directory.

        Args:
            tasks: A list of task names to filter the models.
            require_model_meta: If True, only return models that have a model_meta.json file.
            include_remote: If True, include remote results in the returned models.

        Returns:
            A list of tuples containing the model name and revision.
        """
        cache_paths = self.get_cache_paths(
            tasks=tasks,
            require_model_meta=require_model_meta,
            include_remote=include_remote,
        )
        models = [(p.parent.parent.name, p.parent.name) for p in cache_paths]
        return list(set(models))

    def get_task_names(
        self,
        models: list[str] | list[ModelMeta] | None = None,
        require_model_meta: bool = True,
        include_remote: bool = True,
    ) -> list[str]:
        """Get all task names in the cache directory.

        Args:
            models: A list of model names or ModelMeta objects to filter the task names.
            require_model_meta: If True, only return task names that have a model_meta.json file
            include_remote: If True, include remote results in the returned task names.

        Returns:
            A list of task names in the cache directory.
        """
        cache_paths = self.get_cache_paths(
            models=models,
            require_model_meta=require_model_meta,
            include_remote=include_remote,
        )
        tasks = [p.stem for p in cache_paths]
        return list(set(tasks))

    @staticmethod
    def _get_model_name_and_revision_from_path(
        revision_path: Path,
    ) -> tuple[ModelName, Revision, str | None]:
        """Determine model name, revision and experiment name for a results folder.

        Args:
            revision_path: The folder that holds the result files. If it contains a
                model_meta.json file, the values are read from that file; otherwise
                they are derived from the directory names of the path.

        Returns:
            A tuple containing the model name, revision and experiment name (if available).

        """
        model_meta = revision_path / "model_meta.json"

        if model_meta.exists():
            with model_meta.open("r") as f:
                model_meta_json = json.load(f)
            model_name = model_meta_json["name"]
            revision = model_meta_json["revision"]
            experiment_kwargs = model_meta_json.get("experiment_kwargs", None)
            return (
                model_name,
                revision,
                _serialize_experiment_kwargs_to_name(experiment_kwargs),
            )

        logger.debug(
            f"model_meta.json not found in {revision_path}, extracting model_name and revision from the path"
        )

        if _EXPERIMENTS_FOLDER_NAME not in revision_path.parts:
            # Layout: <model>/<revision>; "__" in the folder name stands for "/".
            model_name = revision_path.parent.name.replace("__", "/")
            return model_name, revision_path.name, None

        logger.debug(
            f"Path {revision_path} contains an experiment folder, extracting model_name and revision accordingly"
        )
        # Layout: <model>/<revision>/<experiments folder>/<experiment>
        revision_dir = revision_path.parent.parent
        model_name = revision_dir.parent.name.replace("__", "/")
        return model_name, revision_dir.name, revision_path.name

    @staticmethod
    def _filter_paths_by_model_and_revision(
        paths: list[Path],
        models: Sequence[str] | Iterable[ModelMeta] | None = None,
        load_experiments: LoadExperimentEnum | None = None,
    ) -> list[Path]:
        """Filter a list of paths by model name and optional revision.

        The kind of filter is chosen from the first element of `models`:

        - ModelMeta objects match on model directory name and revision (a missing
          revision is treated as "no_revision_available"), and additionally on the
          experiment name when `load_experiments` is MATCH_KWARGS.
        - Strings match on the model directory name only ("/" becomes "__" and
          " " becomes "_").

        Args:
            paths: Result file paths laid out as <model>/<revision>/<task>.json, or
                <model>/<revision>/<experiments folder>/<experiment>/<task>.json.
            models: Model names or ModelMeta objects to keep. None or empty disables filtering.
            load_experiments: Experiment loading mode; only MATCH_KWARGS changes the matching.

        Returns:
            A list of paths that match the specified model names and revisions.
        """
        # Materialise once: `models` may be a one-shot iterator, and peeking at its
        # first element must neither drop that element nor fail on an empty one.
        model_list = list(models) if models is not None else []
        if not model_list:
            return paths

        def _path_components(path: Path) -> tuple[str, str, str | None]:
            """Return (model dir, revision dir, experiment dir or None) for a result path."""
            if _EXPERIMENTS_FOLDER_NAME in path.parts:
                revision_dir = path.parent.parent.parent
                return revision_dir.parent.name, revision_dir.name, path.parent.name
            return path.parent.parent.name, path.parent.name, None

        if isinstance(model_list[0], ModelMeta):
            meta_models = cast("list[ModelMeta]", model_list)
            match_experiment = load_experiments is LoadExperimentEnum.MATCH_KWARGS
            name_and_revision = {
                (
                    m.model_name_as_path(),
                    m.revision or "no_revision_available",
                    m.experiment_name if match_experiment else None,
                )
                for m in meta_models
            }
            selected = []
            for path in paths:
                model_name, revision, experiment_name = _path_components(path)
                if not match_experiment:
                    # The experiment folder only takes part in the match on request.
                    experiment_name = None
                if (model_name, revision, experiment_name) in name_and_revision:
                    selected.append(path)
            return selected

        str_models = cast("list[str]", model_list)
        model_names = {m.replace("/", "__").replace(" ", "_") for m in str_models}
        return [p for p in paths if _path_components(p)[0] in model_names]

    @staticmethod
    def _filter_paths_by_task(
        paths: list[Path],
        tasks: Sequence[str] | Iterable[AbsTask] | None = None,
    ) -> list[Path]:
        """Keep only the paths whose file stem is one of the given task names.

        Args:
            paths: Result file paths to filter.
            tasks: Task names or AbsTask objects (their ``metadata.name`` is used).
                If None, ``paths`` is returned unchanged.

        Returns:
            The paths whose stem matches a requested task name.
        """
        if tasks is None:
            return paths

        wanted = {
            entry.metadata.name if isinstance(entry, AbsTask) else entry
            for entry in tasks
        }
        return [path for path in paths if path.stem in wanted]

    def _load_model_meta_from_cache(
        self,
        model_name: str,
        revision: str,
    ) -> ModelMeta | None:
        """Read the ``model_meta.json`` of a locally cached model revision.

        The file is looked up at
        ``<cache_path>/results/<model directory>/<revision>/model_meta.json``.

        Args:
            model_name: The model name; "/" and " " are replaced to form the
                directory name.
            revision: The model revision (directory name).

        Returns:
            The parsed ModelMeta, or None (after logging a warning) when the file
            is missing or cannot be loaded.
        """
        directory_name = model_name.replace("/", "__").replace(" ", "_")
        revision_dir = self.cache_path / "results" / directory_name / revision
        meta_file = revision_dir / "model_meta.json"

        if meta_file.exists():
            try:
                raw_json = meta_file.read_text()
                return ModelMeta.model_validate_json(raw_json)
            except Exception as e:
                # Best effort: a broken meta file must not abort the caller.
                logger.warning(f"Failed to load ModelMeta from {meta_file}: {e}")
                return None

        logger.warning(
            f"model_meta.json not found for {model_name} (revision: {revision})"
        )
        return None

    def _normalize_models(
        self,
        models: Sequence[str] | Sequence[ModelMeta] | str | ModelMeta | None = None,
    ) -> list[ModelMeta]:
        """Normalize model input to list of ModelMeta objects.

        Args:
            models: Model(s) to normalize. Can be a single model name or ModelMeta,
                or a sequence of them. If None, all models found in the local
                cache (remote excluded) that have a model_meta.json are used.

        Returns:
            List of ModelMeta objects. When ``models`` is None, the list may be
            empty if none of the cached meta files could be loaded.

        Raises:
            ValueError: If no models are found in the local cache, a model name is
                not in the local cache, a ModelMeta lacks a name or revision, or
                nothing valid remains.
            TypeError: If an element is neither a string nor a ModelMeta.
        """
        if models is None:
            local_models = self.get_models(
                require_model_meta=True, include_remote=False
            )
            if not local_models:
                raise ValueError(
                    "No models found in local cache. Please evaluate models first."
                )
            normalized = []
            for model_name, revision in local_models:
                model_meta = self._load_model_meta_from_cache(model_name, revision)
                if model_meta:
                    normalized.append(model_meta)
            return normalized

        if isinstance(models, (str, ModelMeta)):
            models_to_process: list[str | ModelMeta] = [models]
        else:
            models_to_process = cast("list[str | ModelMeta]", models)

        # Fetched lazily and only once: listing the cache walks the file system,
        # and it is not needed at all when only ModelMeta objects are passed.
        cached_models = None

        normalized = []
        for model in models_to_process:
            if isinstance(model, ModelMeta):
                if model.revision is None or model.name is None:
                    raise ValueError(
                        f"ModelMeta {model.name} has no revision or name. "
                        "Cannot submit results without both."
                    )
                normalized.append(model)
            elif isinstance(model, str):
                if cached_models is None:
                    cached_models = self.get_models(
                        require_model_meta=False, include_remote=False
                    )
                # Same directory-name normalisation as the rest of the cache
                # ("/" -> "__", " " -> "_"); get_models returns directory names.
                directory_name = model.replace("/", "__").replace(" ", "_")
                matching = [
                    (name, rev) for name, rev in cached_models if name == directory_name
                ]
                if not matching:
                    raise ValueError(
                        f"Model '{model}' not found in local cache. "
                        "Please evaluate it first."
                    )
                for model_name, revision in matching:
                    model_meta = self._load_model_meta_from_cache(model_name, revision)
                    if model_meta:
                        normalized.append(model_meta)
            else:
                raise TypeError(f"Invalid model type: {type(model)}")

        if not normalized:
            raise ValueError("No valid models to submit.")

        return normalized

    def _get_unsubmitted_results(
        self,
        models: list[ModelMeta],
    ) -> dict[ModelMeta, list[Path]]:
        """Collect the local result files of each model.

        For every model the local cache (remote excluded) is queried for result
        paths matching the model, its revision and its experiment name. No
        comparison against the remote repository happens here.

        Args:
            models: List of ModelMeta objects.

        Returns:
            Dict mapping each ModelMeta to its list of local result file paths.
            Every model gets an entry, even when its list is empty.
        """
        paths_by_model: dict[ModelMeta, list[Path]] = {}
        for model_meta in models:
            paths_by_model[model_meta] = self.get_cache_paths(
                models=[model_meta],
                require_model_meta=False,
                include_remote=False,
                load_experiments=LoadExperimentEnum.MATCH_KWARGS,
            )
        return paths_by_model

    def submit_results(
        self,
        models: Sequence[str] | Sequence[ModelMeta] | str | ModelMeta | None = None,
        *,
        create_pr: bool = False,
    ) -> SubmitResultsResponse:
        """Create a commit of the results to the official MTEB results repository (https://github.com/embeddings-benchmark/results).

        It does this by downloading the remote (if not downloaded already),
        creating a fresh branch in the local clone, copying the local results into
        it and committing them. Requires PyGithub to be installed if
        `create_pr=True`.

        Args:
            models: Model(s) whose results should be submitted. Can be a single model name or ModelMeta,
                or a list of them. If None it will get all models from local cache.
            create_pr: If True, create a PR directly to the remote. If False, logs
                  instructions for manual submission and leaves the clone on the
                  submission branch.

        Returns:
            Dictionary containing submission metadata:
                - status: "no_changes", "ready_for_submission" or "pr_created"
                - models_submitted: list of (model_name, revision) tuples
                - result_count: number of result files submitted
                - path: path of the local clone (only if create_pr=False)
                - pr_url: URL to created PR (only if create_pr=True)
                - pr_number: PR number (only if create_pr=True)
                - fork_url: URL to user's fork (only if create_pr=True)

        Raises:
            ValueError: If no models found or invalid input.
            RuntimeError: If git operations fail.
            ImportError: If create_pr=True and PyGithub is not installed.
            GithubException: If GitHub API operations fail.

        Examples:
            >>> import mteb
            >>> cache = mteb.ResultCache()
            >>> model_meta = mteb.get_model_meta(...)
            >>> tasks = mteb.get_tasks(...)
            >>> results = mteb.evaluate(model_meta, tasks, cache=cache)
            >>>
            >>> # Manual submission (step-by-step)
            >>> submission = cache.submit_results(model_meta, create_pr=False)
            >>> # Follow printed instructions
            >>>
            >>> # Automated submission
            >>> submission = cache.submit_results(model_meta, create_pr=True)
            >>> print(f"PR created: {submission['pr_url']}")
        """
        # Always create a new branch to keep the original branch clean
        branch_name = f"mteb-results-{int(datetime.now().timestamp())}"
        normalized_models = self._normalize_models(models)

        try:
            self.download_from_remote()
            unsubmitted = self._get_unsubmitted_results(normalized_models)

            # The mapping has one entry per model even when that model has no
            # result files, so the path lists themselves must be inspected;
            # checking only the dict would never detect "nothing to submit".
            if not any(unsubmitted.values()):
                logger.warning("No unsubmitted results found.")
                return SubmitResultsResponse(
                    status="no_changes",
                    models_submitted=[(m.name, m.revision) for m in normalized_models],
                    result_count=0,
                )

            remote_path = self.remote_repo_path
            check_uncommitted_changes(remote_path)
            check_detached_head(remote_path)
            logger.info("Pre-flight checks passed.")

            # Capture original branch before making any changes
            original_branch = get_current_branch(remote_path)

        except RuntimeError as e:
            logger.error(f"Setup error during submit_results: {e}")
            raise
        except Exception as e:
            logger.error(f"Error during submit_results setup: {e}")
            raise

        actions: list[ReversibleAction] = [
            CreateBranchAction(remote_path, branch_name, original_branch),
            CopyResultsAction(unsubmitted, self.remote_results_path),
        ]

        commit_message, result_count = _build_commit_message(
            normalized_models, unsubmitted
        )

        actions.append(CommitAction(remote_path, commit_message))

        workflow = ReversibleWorkflow(steps=actions)
        workflow.run()

        if not create_pr:
            # For manual submission, stay on the submission branch so the commit is accessible
            # The user will push this branch to their fork and create a PR
            message = _build_manual_submission_message(
                remote_path, result_count, len(normalized_models), branch_name
            )
            logger.info("%s", message)

            return SubmitResultsResponse(
                status="ready_for_submission",
                models_submitted=[(m.name, m.revision) for m in normalized_models],
                result_count=result_count,
                path=str(remote_path),
            )

        pr_body = _prepare_pr_body(normalized_models, unsubmitted)
        return handle_pr_creation_with_cleanup(
            remote_repo_path=remote_path,
            original_branch=original_branch,
            branch_name=branch_name,
            models=normalized_models,
            result_count=result_count,
            pr_body=pr_body,
        )

    def load_results(
        self,
        models: Sequence[str] | Iterable[ModelMeta] | None = None,
        tasks: Sequence[str]
        | Iterable[AbsTask]
        | Benchmark
        | Sequence[Benchmark]
        | str
        | None = None,
        *,
        require_model_meta: bool = True,
        include_remote: bool = True,
        validate_and_filter: bool = False,
        only_main_score: bool = False,
        load_experiments: LoadExperimentEnum | str = LoadExperimentEnum.MATCH_KWARGS,
        experiment_kwargs: Mapping[str, Any] | list[Mapping[str, Any]] | None = None,
    ) -> BenchmarkResults:
        """Loads the results from the cache directory and returns a BenchmarkResults object.

        Args:
            models: A list of model names or ModelMeta objects to load the results for. If None (or empty) it will
                load the results for all models.
            tasks: A list of task names or tasks to load the results for. If str is passed, then benchmark will be loaded.
                If Benchmark (or a list of Benchmarks) is passed, then all tasks in the benchmark(s) will be loaded.
                If None it will load the results for all tasks.
            require_model_meta: If True it will ignore results that do not have a model_meta.json file. If false it attempt to
                extract the model name and revision from the path.
            include_remote: If True, it will include results from the remote repository.
            validate_and_filter: If True it will validate that the results object for the task contains the correct splits and filter out
                splits from the results object that are not default in the task metadata. Results failing validation are skipped.
            only_main_score: If True, only the main score will be loaded.
            load_experiments: Controls whether results from experiment folders are loaded. A string is converted with
                `LoadExperimentEnum.from_str`. With `MATCH_KWARGS` (the default) and no `experiment_kwargs`, results that
                carry an experiment name are skipped unless `models` contains ModelMeta objects.
            experiment_kwargs: If specified, it will only load results from experiments with the specified kwargs.
                Only meaningful when load_experiments is `MATCH_KWARGS`; otherwise a warning is emitted.

        Returns:
            A BenchmarkResults object containing the results for the specified models and tasks.

        Examples:
            >>> import mteb
            >>> cache = mteb.ResultCache()
            >>>
            >>> # Load results for specific models and tasks
            >>> results = cache.load_results(
            ...     models=["sentence-transformers/all-MiniLM-L6-v2"],
            ...     tasks=["STS12"],
            ...     require_model_meta=True,
            ... )
        """
        if isinstance(tasks, str):
            tasks = get_benchmark(tasks)

        benchmarks: Sequence[Benchmark] | Benchmark | None = None
        if isinstance(tasks, Benchmark):
            benchmarks = tasks
        elif isinstance(tasks, Sequence):
            # Guard the peek at the first element: an empty sequence is valid
            # input and simply selects no tasks.
            if len(tasks) > 0 and isinstance(tasks[0], Benchmark):
                benchmarks = tasks  # type: ignore[assignment]
                tasks = [sub for task in tasks for sub in cast("Benchmark", task).tasks]
        elif tasks is not None:
            # `tasks` is iterated twice (path filtering and the name lookup
            # below), so a one-shot iterable has to be materialised first.
            tasks = list(tasks)

        if isinstance(load_experiments, str):
            load_experiments = LoadExperimentEnum.from_str(load_experiments)

        if (
            load_experiments is not LoadExperimentEnum.MATCH_KWARGS
            and experiment_kwargs is not None
        ):
            warnings.warn(
                "experiment_kwargs is specified but load_experiments is not set to MATCH_KWARGS. "
                "No results will be loaded."
            )

        # Materialise so that peeking at the first element neither consumes a
        # one-shot iterable nor fails on an empty input.
        if models is not None:
            models = list(models)
        models_as_model_meta = bool(models) and isinstance(models[0], ModelMeta)

        paths = self.get_cache_paths(
            models=models,
            tasks=tasks,  # type: ignore[arg-type]
            require_model_meta=require_model_meta,
            include_remote=include_remote,
            load_experiments=load_experiments,
        )
        models_results = defaultdict(list)

        task_names: dict[str, AbsTask | None] = {}
        if tasks is not None:
            for task in tasks:
                if isinstance(task, AbsTask):
                    task_names[task.metadata.name] = task
                else:
                    task_names[cast("str", task)] = None

        experiment_names = set()
        if isinstance(experiment_kwargs, Mapping):
            experiment_kwargs = [experiment_kwargs]
        if isinstance(experiment_kwargs, list):
            experiment_names = {
                _serialize_experiment_kwargs_to_name(params)
                for params in experiment_kwargs
            }
        for path in paths:
            task_result = TaskResult.from_disk(path)

            if only_main_score:
                task_result = task_result.only_main_score()
            model_name, revision, experiment_name = (
                self._get_model_name_and_revision_from_path(path.parent)
            )

            if validate_and_filter:
                # No task object is known when `tasks` was None or given as
                # names; fall back to None instead of raising KeyError.
                task_instance = task_names.get(task_result.task_name)
                try:
                    task_result = task_result.validate_and_filter_scores(
                        task=task_instance
                    )
                except ValidationError as e:
                    logger.info(
                        f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
                    )
                    continue

            if len(experiment_names) > 0 and experiment_name not in experiment_names:
                logger.debug(
                    f"Skipping experiment {experiment_name} as it is not in the specified experiment names"
                )
                continue

            if (
                load_experiments is LoadExperimentEnum.MATCH_KWARGS
                and not models_as_model_meta  # for models meta path are prefiltered
                and len(experiment_names) == 0
                and experiment_name is not None
            ):
                continue

            models_results[(model_name, revision, experiment_name)].append(task_result)

        # create BenchmarkResults object
        models_results_object = [
            ModelResult(
                model_name=model_name,
                model_revision=revision,
                task_results=task_results,
                experiment_name=experiment_name,
            )
            for (
                model_name,
                revision,
                experiment_name,
            ), task_results in models_results.items()
        ]

        return BenchmarkResults(
            model_results=models_results_object,
            benchmark=benchmarks,
        )

`default_cache_path` `property` ¶

Get the local cache directory for MTEB results.

Returns:

Type	Description
`Path`	The path to the local cache directory.

`has_remote` `property` ¶

Check if the remote results repository exists in the cache directory.

Returns:

Type	Description
`bool`	True if the remote results repository exists, False otherwise.

`remote_repo_path` `property` ¶

Get the path to the remote repository clone.

Returns:

Type	Description
`Path`	The path to the remote repository clone.

`remote_results_path` `property` ¶

Get the path to the remote results directory.

Returns:

Type	Description
`Path`	The path to the remote results directory.

`clear_cache()` ¶

Clear the local cache directory.

Source code in mteb/cache/result_cache.py

def clear_cache(self) -> None:
    """Delete the cache directory and everything in it.

    If the cache path is missing or is not a directory, nothing is removed;
    the situation is reported through the logger and as a Python warning.
    """
    cache_dir = self.cache_path
    if not (cache_dir.exists() and cache_dir.is_dir()):
        msg = f"Cache directory `{self.cache_path}` does not exist."
        logger.warning(msg)
        warnings.warn(msg)
        return

    shutil.rmtree(cache_dir)
    logger.info(f"Cache directory {self.cache_path} cleared.")

`download_from_remote(remote='https://github.com/embeddings-benchmark/results', download_latest=True, revision=None)` ¶

Downloads the latest version of the results repository from GitHub to a local cache directory. Requires git to be installed.

Parameters:

Name	Type	Description	Default
`remote`	`str`	The URL of the results repository on GitHub.	`'https://github.com/embeddings-benchmark/results'`
`download_latest`	`bool`	If True it will download the latest version of the repository, otherwise it will only update the existing repository.	`True`
`revision`	`str \| None`	If specified, it will checkout the given revision after cloning or pulling the repository.	`None`

Returns:

Type	Description
`Path`	The path to the local clone of the results repository inside the cache directory.

Source code in mteb/cache/result_cache.py

def download_from_remote(
    self,
    remote: str = "https://github.com/embeddings-benchmark/results",
    download_latest: bool = True,
    revision: str | None = None,
) -> Path:
    """Downloads the latest version of the results repository from GitHub to a local cache directory. Required git to be installed.

    Args:
        remote: The URL of the results repository on GitHub.
        download_latest: If True it will download the latest version of the repository, otherwise it will only update the existing repository.
        revision: If specified, it will checkout the given revision after cloning or pulling the repository.

    Returns:
        The path to the local cache directory.
    """
    # Validate cache_path is a directory (or doesn't exist yet)
    if self.cache_path.exists() and not self.cache_path.is_dir():
        raise ValueError(
            f"Cache path '{self.cache_path}' exists but is not a directory. "
            "Please remove it or specify a different cache path."
        )

    self.cache_path.mkdir(parents=True, exist_ok=True)

    # if "results" folder already exists update it
    results_directory = self.remote_repo_path

    if results_directory.exists():
        # check repository in the directory is the same as the remote
        remote_url = subprocess.run(
            ["git", "config", "--get", "remote.origin.url"],
            check=False,
            cwd=results_directory,
            capture_output=True,
            text=True,
        ).stdout.strip()
        if remote_url != remote:
            msg = (
                f"remote repository '{remote}' does not match the one in {results_directory},  which is '{remote_url}'."
                + " Please remove the directory and try again."
            )
            raise ValueError(msg)

        if revision or download_latest:
            logger.info(
                f"remote repository already exists in {results_directory}, fetching updates"
            )
            subprocess.run(
                ["git", "fetch", "--all", "--tags"],
                cwd=results_directory,
                check=True,
                text=True,
            )
        else:
            logger.debug(
                f"Results repository already exists in {results_directory}, skipping update, "
                f"set download_latest=True to update it"
            )

        if revision:
            logger.info(f"Checking out revision '{revision}'")
            subprocess.run(
                ["git", "checkout", revision],
                cwd=results_directory,
                check=True,
                text=True,
            )
        return results_directory

    logger.info(
        f"No results repository found in {results_directory}, cloning it from {remote}"
    )

    clone_cmd = ["git", "clone", "--depth", "1"]

    if revision:
        logger.info(f"Cloning repository at revision '{revision}'")
        clone_cmd.append(f"--revision={revision}")
    clone_cmd.extend([remote, "remote"])

    subprocess.run(
        clone_cmd,
        cwd=self.cache_path,
        check=True,
        text=True,
    )

    return results_directory

`get_cache_paths(models=None, tasks=None, require_model_meta=True, include_remote=True, load_experiments=LoadExperimentEnum.NO_EXPERIMENTS)` ¶

Get all paths to result JSON files in the cache directory.

These paths can then be used to fetch task results, like:

for path in paths:
    task_result = TaskResult.from_disk(path)

Parameters:

Name	Type	Description	Default
`models`	`Sequence[str] \| Iterable[ModelMeta] \| None`	A list of model names or ModelMeta objects to filter the paths.	`None`
`tasks`	`Sequence[str] \| Iterable[AbsTask] \| None`	A list of task names to filter the paths.	`None`
`require_model_meta`	`bool`	If True, only return paths that have a model_meta.json file.	`True`
`include_remote`	`bool`	If True, include remote results in the returned paths.	`True`
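`load_experiments`	`LoadExperimentEnum \| str`	Controls whether results stored in experiment folders are included in the returned paths; any value other than `NO_EXPERIMENTS` includes them. A string is converted with `LoadExperimentEnum.from_str`.	`NO_EXPERIMENTS`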

Returns:

Type	Description
`list[Path]`	A list of paths in the cache directory.

Examples:

>>> import mteb
>>> cache = mteb.ResultCache()
>>>
>>> # Get all cache paths
>>> paths = cache.get_cache_paths()
>>>
>>> # Get all cache paths for a specific task
>>> paths = cache.get_cache_paths(tasks=["STS12"])
>>>
>>> # Get all cache paths for a specific model
>>> paths = cache.get_cache_paths(models=["sentence-transformers/all-MiniLM-L6-v2"])
>>>
>>> # Get all cache paths for a specific model and revision
>>> model_meta = mteb.get_model_meta("sentence-transformers/all-MiniLM-L6-v2")
>>> paths = cache.get_cache_paths(models=[model_meta])

Source code in mteb/cache/result_cache.py

def get_cache_paths(
    self,
    models: Sequence[str] | Iterable[ModelMeta] | None = None,
    tasks: Sequence[str] | Iterable[AbsTask] | None = None,
    require_model_meta: bool = True,
    include_remote: bool = True,
    load_experiments: LoadExperimentEnum | str = LoadExperimentEnum.NO_EXPERIMENTS,
) -> list[Path]:
    """Get all paths to result JSON files in the cache directory.

    These paths can then be used to fetch task results, like:
    ```python
    for path in paths:
        task_result = TaskResult.from_disk(path)
    ```

    Args:
        models: A list of model names or ModelMeta objects to filter the paths.
        tasks: A list of task names or tasks to filter the paths.
        require_model_meta: If True, only return paths whose folder also contains a model_meta.json file.
        include_remote: If True, include remote results in the returned paths.
        load_experiments: Controls whether results stored in experiment folders are included; any value
            other than `NO_EXPERIMENTS` includes them. A string is converted with `LoadExperimentEnum.from_str`.

    Returns:
        A list of paths in the cache directory.

    Examples:
        >>> import mteb
        >>> cache = mteb.ResultCache()
        >>>
        >>> # Get all cache paths
        >>> paths = cache.get_cache_paths()
        >>>
        >>> # Get all cache paths for a specific task
        >>> paths = cache.get_cache_paths(tasks=["STS12"])
        >>>
        >>> # Get all cache paths for a specific model
        >>> paths = cache.get_cache_paths(models=["sentence-transformers/all-MiniLM-L6-v2"])
        >>>
        >>> # Get all cache paths for a specific model and revision
        >>> model_meta = mteb.get_model_meta("sentence-transformers/all-MiniLM-L6-v2")
        >>> paths = cache.get_cache_paths(models=[model_meta])
    """
    if isinstance(load_experiments, str):
        load_experiments = LoadExperimentEnum.from_str(load_experiments)

    def _result_files(base_path: Path, experiments: LoadExperimentEnum) -> list[Path]:
        # Regular results first (model/revision/task.json), then, if requested,
        # the results stored below the experiments folder.
        patterns = ["*/*/*.json"]
        if experiments != LoadExperimentEnum.NO_EXPERIMENTS:
            patterns.append(f"*/*/{_EXPERIMENTS_FOLDER_NAME}/*/*.json")

        found: list[Path] = []
        for pattern in patterns:
            found.extend(
                candidate
                for candidate in base_path.glob(pattern)
                if candidate.name != "model_meta.json"
            )
        return found

    local_root = self.cache_path / "results"
    remote_root = self.remote_results_path

    candidates = _result_files(local_root, load_experiments)
    if include_remote:
        candidates += _result_files(remote_root, load_experiments)

    by_model = self._filter_paths_by_model_and_revision(
        candidates,
        models=models,
        load_experiments=load_experiments,
    )
    selected = self._filter_paths_by_task(by_model, tasks=tasks)

    if not require_model_meta:
        return selected
    return [
        path for path in selected if (path.parent / "model_meta.json").exists()
    ]

`get_models(tasks=None, require_model_meta=True, include_remote=True)` ¶

Get all models in the cache directory.

Parameters:

Name	Type	Description	Default
`tasks`	`Sequence[str] \| None`	A list of task names to filter the models.	`None`
`require_model_meta`	`bool`	If True, only return models that have a model_meta.json file.	`True`
`include_remote`	`bool`	If True, include remote results in the returned models.	`True`

Returns:

Type	Description
`list[tuple[ModelName, Revision]]`	A list of tuples containing the model name and revision.

Source code in mteb/cache/result_cache.py

def get_models(
    self,
    tasks: Sequence[str] | None = None,
    require_model_meta: bool = True,
    include_remote: bool = True,
) -> list[tuple[ModelName, Revision]]:
    """List the distinct (model, revision) pairs that have cached results.

    Both values are taken from the directory names of the result paths (the
    grandparent and parent folder of each result file), so the model name is
    returned in its directory form.

    Args:
        tasks: A list of task names to filter the models.
        require_model_meta: If True, only return models that have a model_meta.json file.
        include_remote: If True, include remote results in the returned models.

    Returns:
        A list of unique tuples containing the model name and revision, in no
        particular order.
    """
    result_paths = self.get_cache_paths(
        tasks=tasks,
        require_model_meta=require_model_meta,
        include_remote=include_remote,
    )
    unique_models = set()
    for result_path in result_paths:
        revision_dir = result_path.parent
        unique_models.add((revision_dir.parent.name, revision_dir.name))
    return list(unique_models)

`get_task_names(models=None, require_model_meta=True, include_remote=True)` ¶

Get all task names in the cache directory.

Parameters:

Name	Type	Description	Default
`models`	`list[str] \| list[ModelMeta] \| None`	A list of model names or ModelMeta objects to filter the task names.	`None`
`require_model_meta`	`bool`	If True, only return task names that have a model_meta.json file	`True`
`include_remote`	`bool`	If True, include remote results in the returned task names.	`True`

Returns:

Type	Description
`list[str]`	A list of task names in the cache directory.

Source code in mteb/cache/result_cache.py

def get_task_names(
    self,
    models: list[str] | list[ModelMeta] | None = None,
    require_model_meta: bool = True,
    include_remote: bool = True,
) -> list[str]:
    """List the distinct task names that have cached results.

    A task name is the file stem of a result path.

    Args:
        models: A list of model names or ModelMeta objects to filter the task names.
        require_model_meta: If True, only return task names that have a model_meta.json file.
        include_remote: If True, include remote results in the returned task names.

    Returns:
        A list of unique task names in the cache directory, in no particular order.
    """
    result_paths = self.get_cache_paths(
        models=models,
        require_model_meta=require_model_meta,
        include_remote=include_remote,
    )
    unique_tasks = set()
    for result_path in result_paths:
        unique_tasks.add(result_path.stem)
    return list(unique_tasks)

`get_task_result_path(task_name, model_name, model_revision=None, remote=False, experiment_name=None)` ¶

Get the path to the results of a specific task for a specific model and revision.

Parameters:

Name	Type	Description	Default
`task_name`	`str`	The name of the task.	required
`model_name`	`str \| ModelMeta`	The name of the model as a valid directory name or a ModelMeta object.	required
`model_revision`	`str \| None`	The revision of the model. Must be specified if model_name is a string.	`None`
`remote`	`bool`	If True, it will return the path to the remote results repository, otherwise it will return the path to the local results repository.	`False`
`experiment_name`	`str \| None`	The name of the experiment as a valid directory name. If model_name is a ModelMeta object, its experiment_name will be used.	`None`

Returns:

Type	Description
`Path`	The path to the results of the task.

Source code in mteb/cache/result_cache.py

def get_task_result_path(
    self,
    task_name: str,
    model_name: str | ModelMeta,
    model_revision: str | None = None,
    remote: bool = False,
    experiment_name: str | None = None,
) -> Path:
    """Get the path to the results of a specific task for a specific model and revision.

    The path is only constructed; this method does not check that the result file exists.

    Args:
        task_name: The name of the task.
        model_name: The name of the model as a valid directory name or a ModelMeta object.
        model_revision: The revision of the model. Must be specified if model_name is a string.
            If it is left as None, a warning is emitted and the most recently modified revision
            folder of the model is used ("no_revision_available" if the model has no revision folder).
        remote: If True, it will return the path to the remote results repository, otherwise it will return the path to the local results repository.
        experiment_name: The name of the experiment as a valid directory name. If model_name is a ModelMeta object, its experiment_name will be used.

    Returns:
        The path to the results of the task.
    """
    results_folder = (
        self.cache_path / "results" if not remote else self.remote_results_path
    )

    if isinstance(model_name, ModelMeta):
        # The ModelMeta is authoritative for both the revision and the experiment.
        # Warn whenever an explicitly passed value is about to be discarded, not
        # only when it is the revision that gets overridden.
        experiment_overridden = (
            experiment_name is not None
            and experiment_name != model_name.experiment_name
        )
        if model_revision is not None or experiment_overridden:
            logger.warning(
                "model_revision and experiment_name is ignored when model_name is a ModelMeta object"
            )
        model_revision = model_name.revision
        experiment_name = model_name.experiment_name
        model_name = model_name.model_name_as_path()
    elif isinstance(model_name, str):
        # normalize the model name to the directory name used in the cache
        model_name = model_name.replace("/", "__").replace(" ", "_")

    model_path = results_folder / model_name

    if model_revision is None:
        msg = "`model_revision` is not specified, attempting to load the latest revision. To disable this behavior, specify the 'model_revision` explicitly."
        logger.warning(msg)
        warnings.warn(msg)
        # get revs from paths
        revisions = [p for p in model_path.glob("*") if p.is_dir()]
        if not revisions:
            model_revision = "no_revision_available"
        else:
            if len(revisions) > 1:
                logger.warning(
                    f"Multiple revisions found for model {model_name}: {revisions}. Using the latest one (according to latest edit)."
                )
                # sort folder by latest edit time
                revisions.sort(key=lambda p: p.stat().st_mtime, reverse=True)
            model_revision = revisions[0].name

    if experiment_name:
        # experiment results live in a dedicated sub-folder of the revision folder
        return (
            model_path
            / model_revision
            / _EXPERIMENTS_FOLDER_NAME
            / experiment_name
            / f"{task_name}.json"
        )
    return model_path / model_revision / f"{task_name}.json"

`load_results(models=None, tasks=None, *, require_model_meta=True, include_remote=True, validate_and_filter=False, only_main_score=False, load_experiments=LoadExperimentEnum.MATCH_KWARGS, experiment_kwargs=None)` ¶

Loads the results from the cache directory and returns a BenchmarkResults object.

Parameters:

Name	Type	Description	Default
`models`	`Sequence[str] \| Iterable[ModelMeta] \| None`	A list of model names to load the results for. If None it will load the results for all models.	`None`
`tasks`	`Sequence[str] \| Iterable[AbsTask] \| Benchmark \| Sequence[Benchmark] \| str \| None`	A list of task names to load the results for. If str is passed, then benchmark will be loaded. If Benchmark is passed, then all tasks in the benchmark will be loaded. If None it will load the results for all tasks.	`None`
`require_model_meta`	`bool`	If True, it will ignore results that do not have a model_meta.json file. If False, it attempts to extract the model name and revision from the path.	`True`
`include_remote`	`bool`	If True, it will include results from the remote repository.	`True`
`validate_and_filter`	`bool`	If True it will validate that the results object for the task contains the correct splits and filter out splits from the results object that are not default in the task metadata.	`False`
`only_main_score`	`bool`	If True, only the main score will be loaded.	`False`
`load_experiments`	`LoadExperimentEnum \| str`	If True, it will also load results from experiment folders.	`MATCH_KWARGS`
`experiment_kwargs`	`Mapping[str, Any] \| list[Mapping[str, Any]] \| None`	If specified, it will only load results from experiments with the specified kwargs. Only used if load_experiments is True.	`None`

Returns:

Type	Description
`BenchmarkResults`	A BenchmarkResults object containing the results for the specified models and tasks.

Examples:

>>> import mteb
>>> cache = mteb.ResultCache()
>>>
>>> # Load results for specific models and tasks
>>> results = cache.load_results(
...     models=["sentence-transformers/all-MiniLM-L6-v2"],
...     tasks=["STS12"],
...     require_model_meta=True,
... )

Source code in mteb/cache/result_cache.py

def load_results(
    self,
    models: Sequence[str] | Iterable[ModelMeta] | None = None,
    tasks: Sequence[str]
    | Iterable[AbsTask]
    | Benchmark
    | Sequence[Benchmark]
    | str
    | None = None,
    *,
    require_model_meta: bool = True,
    include_remote: bool = True,
    validate_and_filter: bool = False,
    only_main_score: bool = False,
    load_experiments: LoadExperimentEnum | str = LoadExperimentEnum.MATCH_KWARGS,
    experiment_kwargs: Mapping[str, Any] | list[Mapping[str, Any]] | None = None,
) -> BenchmarkResults:
    """Loads the results from the cache directory and returns a BenchmarkResults object.

    Args:
        models: A list of model names to load the results for. If None it will load the results for all models.
        tasks: A list of task names to load the results for. If str is passed, then benchmark will be loaded.
            If Benchmark is passed, then all tasks in the benchmark will be loaded.
            If None it will load the results for all tasks.
        require_model_meta: If True, it will ignore results that do not have a model_meta.json file. If False, it attempts to
            extract the model name and revision from the path.
        include_remote: If True, it will include results from the remote repository.
        validate_and_filter: If True it will validate that the results object for the task contains the correct splits and filter out
            splits from the results object that are not default in the task metadata. Results that fail the validation are skipped.
        only_main_score: If True, only the main score will be loaded.
        load_experiments: Controls which experiment results are loaded. A string is converted using `LoadExperimentEnum.from_str`.
            With `MATCH_KWARGS` (the default) and no `experiment_kwargs`, results stored under an experiment are skipped
            unless `models` are ModelMeta objects.
        experiment_kwargs: A mapping, or a list of mappings, of experiment kwargs. If specified, only results whose experiment
            name matches one of the given kwargs are loaded. A warning is emitted if it is combined with a `load_experiments`
            value other than `MATCH_KWARGS`.

    Returns:
        A BenchmarkResults object containing the results for the specified models and tasks.

    Examples:
        >>> import mteb
        >>> cache = mteb.ResultCache()
        >>>
        >>> # Load results for specific models and tasks
        >>> results = cache.load_results(
        ...     models=["sentence-transformers/all-MiniLM-L6-v2"],
        ...     tasks=["STS12"],
        ...     require_model_meta=True,
        ... )
    """
    if isinstance(tasks, str):
        tasks = get_benchmark(tasks)

    # `models` and `tasks` are traversed more than once below. One-shot iterables
    # (e.g. generators) are materialized so that no element is lost on the way.
    if models is not None and not isinstance(models, Sequence):
        models = list(models)
    if tasks is not None and not isinstance(tasks, (Sequence, Benchmark)):
        tasks = list(tasks)

    benchmarks: Sequence[Benchmark] | Benchmark | None = None
    if (
        isinstance(tasks, Sequence)
        and len(tasks) > 0  # an empty selection has no first element to inspect
        and isinstance(tasks[0], Benchmark)
    ):
        benchmarks = tasks  # type: ignore[assignment]
        tasks = [sub for task in tasks for sub in cast("Benchmark", task).tasks]

    if isinstance(tasks, Benchmark):
        benchmarks = tasks

    if isinstance(load_experiments, str):
        load_experiments = LoadExperimentEnum.from_str(load_experiments)

    if (
        load_experiments is not LoadExperimentEnum.MATCH_KWARGS
        and experiment_kwargs is not None
    ):
        warnings.warn(
            "experiment_kwargs is specified but load_experiments is not set to MATCH_KWARGS. "
            "No results will be loaded."
        )

    # Peek at the first model without failing on an empty selection
    first_model = next(iter(models), None) if models is not None else None
    models_as_model_meta = isinstance(first_model, ModelMeta)

    paths = self.get_cache_paths(
        models=models,
        tasks=tasks,  # type: ignore[arg-type]
        require_model_meta=require_model_meta,
        include_remote=include_remote,
        load_experiments=load_experiments,
    )
    models_results = defaultdict(list)

    # Maps the task name to the task object (None if only the name was given)
    task_names: dict[str, AbsTask | None] = {}
    if tasks is not None:
        for task in tasks:
            if isinstance(task, AbsTask):
                task_names[task.metadata.name] = task
            else:
                task_names[cast("str", task)] = None

    experiment_names = set()
    if isinstance(experiment_kwargs, Mapping):
        experiment_kwargs = [experiment_kwargs]
    if isinstance(experiment_kwargs, list):
        experiment_names = {
            _serialize_experiment_kwargs_to_name(params)
            for params in experiment_kwargs
        }

    filter_by_experiment = len(experiment_names) > 0
    skip_unrequested_experiments = (
        load_experiments is LoadExperimentEnum.MATCH_KWARGS
        and not models_as_model_meta  # for models meta path are prefiltered
        and not filter_by_experiment
    )

    for path in paths:
        model_name, revision, experiment_name = (
            self._get_model_name_and_revision_from_path(path.parent)
        )

        # Both experiment filters only depend on the path, so apply them before
        # paying for reading and validating the result file.
        if filter_by_experiment and experiment_name not in experiment_names:
            logger.debug(
                f"Skipping experiment {experiment_name} as it is not in the specified experiment names"
            )
            continue

        if skip_unrequested_experiments and experiment_name is not None:
            continue

        task_result = TaskResult.from_disk(path)

        if only_main_score:
            task_result = task_result.only_main_score()

        if validate_and_filter:
            # `.get` so that validation also works when no tasks were specified
            task_instance = task_names.get(task_result.task_name)
            try:
                task_result = task_result.validate_and_filter_scores(
                    task=task_instance
                )
            except ValidationError as e:
                logger.info(
                    f"Validation failed for {task_result.task_name} in {model_name} {revision}: {e}"
                )
                continue

        models_results[(model_name, revision, experiment_name)].append(task_result)

    # create BenchmarkResults object
    models_results_object = [
        ModelResult(
            model_name=model_name,
            model_revision=revision,
            task_results=task_results,
            experiment_name=experiment_name,
        )
        for (
            model_name,
            revision,
            experiment_name,
        ), task_results in models_results.items()
    ]

    return BenchmarkResults(
        model_results=models_results_object,
        benchmark=benchmarks,
    )

`load_task_result(task_name, model_name, model_revision=None, raise_if_not_found=False, prioritize_remote=False, experiment_name=None)` ¶

Load the results from the local cache directory.

Parameters:

Name	Type	Description	Default
`task_name`	`str`	The name of the task.	required
`model_name`	`str \| ModelMeta`	The name of the model as a valid directory name or a ModelMeta object.	required
`model_revision`	`str \| None`	The revision of the model. Must be specified if model_name is a string.	`None`
`raise_if_not_found`	`bool`	If True, raise an error if the results are not found.	`False`
`prioritize_remote`	`bool`	If True, it will first try to load the results from the remote repository, if available.	`False`
`experiment_name`	`str \| None`	Optional experiment folder name (a valid directory name). If None, the default is used.	`None`

Returns:

Type	Description
`TaskResult \| None`	The results of the task, or None if not found.

Source code in mteb/cache/result_cache.py

def load_task_result(
    self,
    task_name: str,
    model_name: str | ModelMeta,
    model_revision: str | None = None,
    raise_if_not_found: bool = False,
    prioritize_remote: bool = False,
    experiment_name: str | None = None,
) -> TaskResult | None:
    """Load the result of a single task for a model from the cache.

    The local result is used by default. If the remote results repository is
    available, the remote result is used instead when the local one is missing,
    or when `prioritize_remote` is set and the remote result exists.

    Args:
        task_name: The name of the task.
        model_name: The name of the model as a valid directory name or a ModelMeta object.
        model_revision: The revision of the model. Must be specified if model_name is a string.
        raise_if_not_found: If True, raise an error if the results are not found.
        prioritize_remote: If True, it will first try to load the results from the remote repository, if available.
        experiment_name: Optional experiment folder name (a valid directory name). If None, the default is used.

    Returns:
        The results of the task, or None if not found.

    Raises:
        FileNotFoundError: If the results are not found and `raise_if_not_found` is True.
    """
    lookup = {
        "model_name": model_name,
        "model_revision": model_revision,
        "task_name": task_name,
        "experiment_name": experiment_name,
    }
    result_path = self.get_task_result_path(**lookup)

    if self.has_remote:
        remote_result_path = self.get_task_result_path(remote=True, **lookup)
        prefer_remote = remote_result_path.exists() and prioritize_remote
        # fall back to the remote copy when there is no local result
        if prefer_remote or not result_path.exists():
            result_path = remote_result_path

    if result_path.exists():
        return TaskResult.from_disk(result_path)

    msg = f"Results for {model_name} on {task_name} not found in {result_path}"
    if raise_if_not_found:
        raise FileNotFoundError(msg)
    logger.debug(msg)
    return None

`save_to_cache(task_result, model_name, model_revision=None, *, encode_kwargs=None)` ¶

Save the task results to the local cache directory in the location {model_name}/{model_revision}/{task_name}.json.

Where model_name is a path-normalized model name. In addition we also save a model_meta.json in the revision folder to preserve the model metadata.

Parameters:

Name	Type	Description	Default
`task_result`	`TaskResult`	The results of the task.	required
`model_name`	`str \| ModelMeta`	The name of the model as a valid directory name or a ModelMeta object.	required
`model_revision`	`str \| None`	The revision of the model. Must be specified if model_name is a string.	`None`
`encode_kwargs`	`Mapping[str, Any] \| None`	The keyword arguments passed to the model's encode method during evaluation.	`None`

Source code in mteb/cache/result_cache.py

def save_to_cache(
    self,
    task_result: TaskResult,
    model_name: str | ModelMeta,
    model_revision: str | None = None,
    *,
    encode_kwargs: Mapping[str, Any] | None = None,
) -> None:
    """Write a task result to the local cache as {model_name}/{model_revision}/{task_name}.json.

    Here model_name is a path-normalized model name. Two files are written
    next to the result file:

    - model_meta.json, holding the model metadata (only if `model_name` is a ModelMeta object)
    - run_settings.jsonl, holding one entry per (split, subset) of the result with the
      package versions and the encode kwargs of the run

    Args:
        task_result: The results of the task.
        model_name: The name of the model as a valid directory name or a ModelMeta object.
        model_revision: The revision of the model. Must be specified if model_name is a string.
        encode_kwargs: The keyword arguments passed to the model's encode method during evaluation.
    """
    result_path = self.get_task_result_path(
        model_name=model_name,
        model_revision=model_revision,
        task_name=task_result.task_name,
    )
    target_folder = result_path.parent
    target_folder.mkdir(parents=True, exist_ok=True)
    task_result.to_disk(result_path)

    if isinstance(model_name, ModelMeta):
        with (target_folder / "model_meta.json").open("w") as f:
            json.dump(model_name.to_dict(), f, default=str, indent=4)

    version_dict = _get_package_versions()

    # one settings entry per (split, subset) pair present in the scores
    run_settings_list: list[dict[str, Any]] = [
        {
            "task": task_result.task_name,
            "split": split,
            "subset": score_entry.get("hf_subset", "default"),
            "version": version_dict,
            # the JSON round-trip stringifies values that are not JSON serializable
            "encode_kwargs": {}
            if encode_kwargs is None
            else json.loads(json.dumps(encode_kwargs, default=str)),
        }
        for split, split_scores in task_result.scores.items()
        for score_entry in split_scores
    ]

    if not run_settings_list:
        return
    _write_and_merge_keyed_json(
        target_folder / "run_settings.jsonl", run_settings_list
    )

`submit_results(models=None, *, create_pr=False)` ¶

Create a commit of the results to the official MTEB results repository (https://github.com/embeddings-benchmark/results).

It does this by downloading the remote (if not downloaded already) and submitting the diff from the local result to the repository. Requires PyGithub to be installed if create_pr=True.

Parameters:

Name	Type	Description	Default
`models`	`Sequence[str] \| Sequence[ModelMeta] \| str \| ModelMeta \| None`	Model(s) whose results should be submitted. Can be either a list of strings or ModelMeta objects. If None, it will get all models from the local cache.	`None`
`create_pr`	`bool`	If True, create a PR directly to the remote. If False, prints instructions for manual submission.	`False`

Returns:

Type	Description
`SubmitResultsResponse`	Dictionary containing submission metadata: - status: "ready_for_submission", "pr_created", or "no_changes" (when there are no unsubmitted results) - models_submitted: list of (model_name, revision) tuples - result_count: number of result files submitted - path: path to the local clone of the results repository (only if create_pr=False) - pr_url: URL to created PR (only if create_pr=True) - pr_number: PR number (only if create_pr=True) - fork_url: URL to user's fork (only if create_pr=True)

Raises:

Type	Description
`ValueError`	If no models found or invalid input.
`RuntimeError`	If git operations fail.
`ImportError`	If create_pr=True and PyGithub is not installed.
`GithubException`	If GitHub API operations fail.

Examples:

>>> import mteb
>>> cache = mteb.ResultCache()
>>> model_meta = mteb.get_model_meta(...)
>>> tasks = mteb.get_tasks(...)
>>> results = mteb.evaluate(model_meta, tasks, cache=cache)
>>>
>>> # Manual submission (step-by-step)
>>> submission = cache.submit_results(model_meta, create_pr=False)
>>> # Follow printed instructions
>>>
>>> # Automated submission
>>> submission = cache.submit_results(model_meta, create_pr=True)
>>> print(f"PR created: {submission['pr_url']}")

Source code in mteb/cache/result_cache.py

def submit_results(
    self,
    models: Sequence[str] | Sequence[ModelMeta] | str | ModelMeta | None = None,
    *,
    create_pr: bool = False,
) -> SubmitResultsResponse:
    """Commit local results to a clone of the official MTEB results repository (https://github.com/embeddings-benchmark/results).

    The remote is downloaded (if not downloaded already) and the results that
    are not yet part of it are committed on a new, timestamped branch. The commit
    is then either left for manual submission or turned into a pull request.
    Requires PyGithub to be installed if `create_pr=True`.

    Args:
        models: Model(s) whose results should be submitted. Can be either a list of strings or ModelMeta objects.
            If None it will get all models from local cache.
        create_pr: If True, create a PR directly to the remote. If False, prints
              instructions for manual submission.

    Returns:
        Dictionary containing submission metadata:
            - status: "ready_for_submission", "pr_created", or "no_changes" when
              there is nothing to submit
            - models_submitted: list of (model_name, revision) tuples
            - result_count: number of result files submitted
            - path: path to the local clone of the results repository (only if create_pr=False)
            - pr_url: URL to created PR (only if create_pr=True)
            - pr_number: PR number (only if create_pr=True)
            - fork_url: URL to user's fork (only if create_pr=True)

    Raises:
        ValueError: If no models found or invalid input.
        RuntimeError: If git operations fail.
        ImportError: If create_pr=True and PyGithub is not installed.
        GithubException: If GitHub API operations fail.

    Examples:
        >>> import mteb
        >>> cache = mteb.ResultCache()
        >>> model_meta = mteb.get_model_meta(...)
        >>> tasks = mteb.get_tasks(...)
        >>> results = mteb.evaluate(model_meta, tasks, cache=cache)
        >>>
        >>> # Manual submission (step-by-step)
        >>> submission = cache.submit_results(model_meta, create_pr=False)
        >>> # Follow printed instructions
        >>>
        >>> # Automated submission
        >>> submission = cache.submit_results(model_meta, create_pr=True)
        >>> print(f"PR created: {submission['pr_url']}")
    """
    # The commit always goes to a fresh branch so the original branch stays clean
    submission_stamp = int(datetime.now().timestamp())
    branch_name = f"mteb-results-{submission_stamp}"
    normalized_models = self._normalize_models(models)

    try:
        self.download_from_remote()
        unsubmitted = self._get_unsubmitted_results(normalized_models)

        if not unsubmitted:
            logger.warning("No unsubmitted results found.")
            return SubmitResultsResponse(
                status="no_changes",
                models_submitted=[(m.name, m.revision) for m in normalized_models],
                result_count=0,
            )

        remote_path = self.remote_repo_path
        check_uncommitted_changes(remote_path)
        check_detached_head(remote_path)
        logger.info("Pre-flight checks passed.")

        # Remember the branch we started from before anything is modified
        original_branch = get_current_branch(remote_path)

    except Exception as e:
        # every failure is logged and propagated; only the log wording differs
        if isinstance(e, RuntimeError):
            logger.error(f"Setup error during submit_results: {e}")
        else:
            logger.error(f"Error during submit_results setup: {e}")
        raise

    create_branch = CreateBranchAction(remote_path, branch_name, original_branch)
    copy_results = CopyResultsAction(unsubmitted, self.remote_results_path)

    commit_message, result_count = _build_commit_message(
        normalized_models, unsubmitted
    )

    actions: list[ReversibleAction] = [
        create_branch,
        copy_results,
        CommitAction(remote_path, commit_message),
    ]
    ReversibleWorkflow(steps=actions).run()

    if create_pr:
        pr_body = _prepare_pr_body(normalized_models, unsubmitted)
        return handle_pr_creation_with_cleanup(
            remote_repo_path=remote_path,
            original_branch=original_branch,
            branch_name=branch_name,
            models=normalized_models,
            result_count=result_count,
            pr_body=pr_body,
        )

    # Manual submission: the submission branch stays checked out so that the
    # commit is accessible; the user pushes it to their fork and opens the PR.
    message = _build_manual_submission_message(
        remote_path, result_count, len(normalized_models), branch_name
    )
    logger.info("%s", message)

    return SubmitResultsResponse(
        status="ready_for_submission",
        models_submitted=[(m.name, m.revision) for m in normalized_models],
        result_count=result_count,
        path=str(remote_path),
    )

Result Objects¶

`mteb.results.TaskResult` ¶

Bases: BaseModel

A class to represent the MTEB result.

Attributes:

Name	Type	Description
`task_name`	`str`	The name of the MTEB task.
`dataset_revision`	`str`	The revision of the dataset for the task on the HuggingFace dataset hub.
`mteb_version`	`str \| None`	The version of the MTEB used to evaluate the model.
`scores`	`dict[SplitName, list[ScoresDict]]`	The scores of the model on the dataset. The scores is a dictionary with the following structure; dict[SplitName, list[Scores]]. Where Scores is a dictionary with the following structure; dict[str, Any]. Where the keys and values are scores. Split is the split of the dataset.
`evaluation_time`	`float \| None`	The time taken to evaluate the model.
`kg_co2_emissions`	`float \| None`	The kg of CO2 emissions produced by the model during evaluation.

Examples:

>>> scores = {
...     "train": {
...         "en-de": {
...             "main_score": 0.5,
...         },
...         "en-fr": {
...             "main_score": 0.6,
...         },
...     },
... }
>>> sample_task = ... # some MTEB task
>>> mteb_results = TaskResult.from_task_results(sample_task, scores, evaluation_time=100)
>>> mteb_results.get_score()  # get the main score for all languages
0.55
>>> mteb_results.get_score(languages=["fra"])  # get the main score for French
0.6
>>> mteb_results.to_dict()
{'dataset_revision': '1.0', 'task_name': 'sample_task', 'mteb_version': '1.0.0', 'evaluation_time': 100, 'scores': {'train':
    [
        {'main_score': 0.5, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']},
        {'main_score': 0.6, 'hf_subset': 'en-fr', 'languages': ['eng-Latn', 'fra-Latn']}
    ]}
}

Source code in mteb/results/task_result.py

class TaskResult(BaseModel):  # noqa: PLR0904
    """A class to represent the MTEB result.

    Attributes:
        task_name: The name of the MTEB task.
        dataset_revision: The revision of the dataset for the task on the HuggingFace dataset hub.
        mteb_version: The version of the MTEB used to evaluate the model.
        scores: The scores of the model on the dataset. The scores are a dictionary with the following structure; dict[SplitName, list[Scores]].
            Where Scores is a dictionary with the following structure; dict[str, Any]. Where the keys and values are scores. Split is the split of
            the dataset.
        evaluation_time: The time taken to evaluate the model.
        kg_co2_emissions: The kg of CO2 emissions produced by the model during evaluation.
        date: An optional datetime associated with the result.

    Examples:
        >>> scores = {
        ...     "train": {
        ...         "en-de": {
        ...             "main_score": 0.5,
        ...         },
        ...         "en-fr": {
        ...             "main_score": 0.6,
        ...         },
        ...     },
        ... }
        >>> sample_task = ... # some MTEB task
        >>> mteb_results = TaskResult.from_task_results(sample_task, scores, evaluation_time=100)
        >>> mteb_results.get_score()  # get the main score for all languages
        0.55
        >>> mteb_results.get_score(languages=["fra"])  # get the main score for French
        0.6
        >>> mteb_results.to_dict()
        {'dataset_revision': '1.0', 'task_name': 'sample_task', 'mteb_version': '1.0.0', 'evaluation_time': 100, 'scores': {'train':
            [
                {'main_score': 0.5, 'hf_subset': 'en-de', 'languages': ['eng-Latn', 'deu-Latn']},
                {'main_score': 0.6, 'hf_subset': 'en-fr', 'languages': ['eng-Latn', 'fra-Latn']}
            ]}
        }
    """

    dataset_revision: str
    task_name: str
    mteb_version: str | None
    scores: dict[SplitName, list[ScoresDict]]
    evaluation_time: float | None
    kg_co2_emissions: float | None = None
    date: datetime.datetime | None = None

    @classmethod
    def from_task_results(
        cls,
        task: AbsTask | type[AbsTask],
        scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
        evaluation_time: float,
        kg_co2_emissions: float | None = None,
        date: datetime.datetime | None = None,
    ) -> TaskResult:
        """Create a TaskResult from the task and scores.

        The nested `scores` are flattened to one list of score dictionaries per split.
        Each score dictionary is extended with its `hf_subset`, the `languages` of that
        subset and the installed `mteb_version`.

        Args:
            task: The task to create the TaskResult from.
            scores: The scores of the model on the dataset. The scores are a dictionary with the following structure; dict[SplitName, dict[HFSubset, Scores]].
                Where Scores is a dictionary with the following structure; dict[str, Any]. Where the keys and values are scores. Split is the split of
                the dataset.
            evaluation_time: The time taken to evaluate the model.
            kg_co2_emissions: The kg of CO2 emissions produced by the model during evaluation.
            date: An optional datetime that is stored in the `date` field of the result.

        Returns:
            The created TaskResult.
        """
        task_meta = task.metadata
        subset2langscripts = task_meta.hf_subsets_to_langscripts
        mteb_ver = version("mteb")

        # Languages used for subsets that are missing from the mapping. The list
        # is the same for every such subset, so it is built lazily, at most once.
        fallback_langs: list[str] | None = None

        flat_scores = defaultdict(list)
        for split, hf_subset_scores in scores.items():
            for hf_subset, hf_scores in hf_subset_scores.items():
                if hf_subset in subset2langscripts:
                    eval_langs = subset2langscripts[hf_subset]
                else:
                    # For aggregated tasks, scores may use "default" subset
                    # which isn't in the per-subset langscript mapping.
                    # Collect all languages from the mapping.
                    if fallback_langs is None:
                        # dict.fromkeys removes duplicates and keeps the first-seen order
                        fallback_langs = list(
                            dict.fromkeys(
                                lang
                                for langs in subset2langscripts.values()
                                for lang in langs
                            )
                        )
                    # every entry gets its own copy, so entries never share a list
                    eval_langs = list(fallback_langs)
                _scores = {
                    **hf_scores,
                    "hf_subset": hf_subset,
                    "languages": eval_langs,
                    "mteb_version": mteb_ver,
                }
                flat_scores[split].append(_scores)

        return TaskResult(
            dataset_revision=task.metadata.revision,
            task_name=task.metadata.name,
            mteb_version=mteb_ver,
            scores=flat_scores,
            evaluation_time=evaluation_time,
            kg_co2_emissions=kg_co2_emissions,
            date=date,
        )

    @field_validator("scores")
    @classmethod
    def _validate_scores(
        cls, v: dict[SplitName, list[ScoresDict]]
    ) -> dict[SplitName, list[ScoresDict]]:
        """Check that every entry of every split is a valid score dictionary.

        Raises:
            ValueError: If an entry is not a dictionary or fails `_validate_scores_dict`.
        """
        for subset_entries in v.values():
            for entry in subset_entries:
                if not isinstance(entry, dict):
                    raise ValueError("Scores should be a dictionary")
                cls._validate_scores_dict(entry)
        return v

    @model_validator(mode="after")
    def _backfill_per_subset_mteb_version(self) -> Self:
        """Copy the top-level mteb_version into every score entry that has none.

        Entries that already carry an `mteb_version` are left untouched, and nothing
        is changed if the top-level version is None.
        """
        top_level_version = self.mteb_version
        if top_level_version is not None:
            for entries in self.scores.values():
                for entry in entries:
                    if "mteb_version" in entry:
                        continue
                    entry["mteb_version"] = top_level_version  # type: ignore[index]
        return self

    @staticmethod
    def _validate_scores_dict(scores: ScoresDict) -> None:
        """Validate a single score dictionary.

        The dictionary must contain a `main_score`, a string `hf_subset` and a list of
        `languages`, and it must be JSON serializable.

        Args:
            scores: The score dictionary to validate.

        Raises:
            ValueError: If a required key is missing, has the wrong type, or if the
                dictionary can not be serialized to JSON.
        """
        if "main_score" not in scores:
            raise ValueError("'main_score' should be in scores")
        if "hf_subset" not in scores or not isinstance(scores["hf_subset"], str):
            raise ValueError("hf_subset should be in scores and should be a string")
        if "languages" not in scores or not isinstance(scores["languages"], list):
            raise ValueError("languages should be in scores and should be a list")

        # check that it is json serializable
        try:
            _ = json.dumps(scores)
        except Exception as e:
            # chain the original error so its traceback is kept as the explicit cause
            raise ValueError(f"Scores are not json serializable: {e}") from e

    @property
    def languages(self) -> list[str]:
        """The distinct language codes found in the scores, in no particular order.

        Only the part before the first "-" of each language entry is kept.
        """
        unique_codes = {
            lang.split("-")[0]
            for split_entries in self.scores.values()
            for entry in split_entries
            for lang in entry["languages"]
        }
        return list(unique_codes)

    @cached_property
    def task(self) -> AbsTask:
        """The task object looked up from `task_name` (cached after the first access)."""
        # imported inside the property rather than at module level
        from mteb.get_tasks import get_task

        resolved_task = get_task(self.task_name)
        return resolved_task

    @property
    def domains(self) -> list[TaskDomain]:
        """The domains from the task metadata, or an empty list if none are set."""
        task_domains = self.task.metadata.domains
        return [] if task_domains is None else task_domains

    @property
    def task_type(self) -> str:
        """The type of the task as given in the task metadata."""
        metadata = self.task.metadata
        return cast("str", metadata.type)

    @property
    def is_public(self) -> bool:
        """Whether the task is public according to the task metadata."""
        metadata = self.task.metadata
        return metadata.is_public

    @property
    def main_score(self) -> float:
        """Return the aggregated main score, i.e. `get_score()` called with its defaults."""
        aggregated = self.get_score()
        return aggregated

    @property
    def hf_subsets(self) -> list[str]:
        """List the distinct hf_subsets that appear in any split of the scores."""
        names = {
            entry["hf_subset"]
            for subset_entries in self.scores.values()
            for entry in subset_entries
        }
        return list(names)

    @property
    def eval_splits(self) -> list[str]:
        """List the split names for which scores are stored."""
        return [*self.scores]

    def to_dict(self) -> dict[str, Any]:
        """Convert the TaskResult to a dictionary.

        Returns:
            The TaskResult as a dictionary.
        """
        return self.model_dump()

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Self:
        """Build a TaskResult by validating a dictionary.

        Args:
            data: Dictionary holding the TaskResult fields.

        Returns:
            The validated TaskResult object.
        """
        instance = cls.model_validate(data)
        return instance

    def _round_scores(self, scores: dict[SplitName, list[ScoresDict]], n: int) -> None:
        """Recursively round scores to n decimal places"""
        for key, value in scores.items():
            if isinstance(value, dict):
                self._round_scores(value, n)
            elif isinstance(value, list):
                for i, v in enumerate(value):
                    if isinstance(v, dict):
                        self._round_scores(v, n)
                    elif isinstance(v, float):
                        value[i] = round(v, n)  # type: ignore[call-overload]

            elif isinstance(value, float):
                scores[key] = round(value, n)

    def to_disk(self, path: Path) -> None:
        """Save TaskResult to disk as JSON.

        The `date` field is written as a timestamp (or null when it is unset) and the scores
        are rounded to 6 decimal places before writing.

        Args:
            path: The path to the file to save.
        """
        json_obj = self.model_dump()
        json_obj["date"] = self.date.timestamp() if self.date else None
        self._round_scores(json_obj["scores"], 6)

        # explicit encoding: do not depend on the platform default, and stay consistent
        # with the utf-8 read performed by `from_disk`
        with path.open("w", encoding="utf-8") as f:
            json.dump(json_obj, f, indent=2)

    @classmethod
    def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
        """Load TaskResult from disk.

        Args:
            path: The path to the file to load.
            load_historic_data: Whether to attempt to load historic data from before v1.11.0.

        Returns:
            The loaded TaskResult object.

        Raises:
            ValueError: If `load_historic_data` is False and the file does not validate as a
                TaskResult.
        """

        def _is_older_than(raw_version: str | None, threshold: str) -> bool:
            """Return True if `raw_version` parses to a version below `threshold`.

            A missing or unparsable version is never reported as older.
            """
            if raw_version is None:
                return False
            try:
                return Version(raw_version) < Version(threshold)
            except ValueError:
                # assumes `Version` is packaging's, whose InvalidVersion is a ValueError
                pass
            # merged results store a "<min>-<max>" range (see `_compute_top_level_mteb_version`);
            # compare against the lower bound of that range
            lower_bound = raw_version.split("-", 1)[0]
            try:
                return Version(lower_bound) < Version(threshold)
            except ValueError:
                return False

        with path.open("r", encoding="utf-8") as f:
            json_str = f.read()

        if not load_historic_data:
            try:
                return cls.model_validate_json(json_str)
            except Exception as e:
                raise ValueError(
                    f"Error loading TaskResult from disk. You can try to load historic data by setting `load_historic_data=True`. Error: {e}"
                ) from e
        data = json.loads(json_str)
        has_version_key = "mteb_version" in data
        stored_version = data["mteb_version"] if has_version_key else None
        # assume it is before 1.11.0 if the version is not present
        pre_1_11_load = not has_version_key or _is_older_than(stored_version, "1.11.0")

        try:
            obj: TaskResult = cls.model_validate_json(json_str)
        except Exception as e:
            if not pre_1_11_load:
                # bare raise keeps the original traceback untouched
                raise
            logger.debug(
                f"Could not load TaskResult from disk, got error: {e}. Attempting to load from disk using format from before v1.11.0"
            )
            obj = cls._convert_from_before_v1_11_0(data)

        if _is_older_than(stored_version, "1.12.48"):
            cls._fix_pair_classification_scores(obj)

        return obj

    @classmethod
    def _fix_pair_classification_scores(cls, obj: TaskResult) -> None:
        """Flatten nested score dictionaries of PairClassification results in place.

        When the task's metadata type is "PairClassification", every dict-valued entry in a
        subset's scores is replaced by flat "<key>_<inner key>" entries,
        e.g. ["max"]["ap"] -> ["max_ap"]. Results of other task types are left untouched.

        Args:
            obj: The TaskResult whose scores are rewritten.
        """
        from mteb import get_task

        name = obj.task_name
        resolved: AbsTask | type[AbsTask] = (
            outdated_tasks[name] if name in outdated_tasks else get_task(obj.task_name)
        )
        if resolved.metadata.type != "PairClassification":
            return

        for subset_entries in obj.scores.values():
            for entry in subset_entries:
                # walk a snapshot of the keys since the dictionary is modified below
                for outer in tuple(entry):
                    nested = entry[outer]
                    if not isinstance(nested, dict):
                        continue
                    for inner, value in nested.items():
                        entry[f"{outer}_{inner}"] = value  # type: ignore[index]
                    entry.pop(outer)  # type: ignore[attr-defined]

    @classmethod
    def _convert_from_before_v1_11_0(cls, data: dict[str, Any]) -> TaskResult:
        """Convert a result dictionary in the pre-v1.11.0 layout into a TaskResult.

        The legacy dictionary is read as a few top-level metadata keys ("dataset_revision",
        "mteb_dataset_name", "mteb_version") next to one entry per split.

        Args:
            data: The parsed content of a legacy result file. Must contain "mteb_dataset_name".

        Returns:
            A TaskResult created with `from_task_results`, whose `dataset_revision` and
            `mteb_version` are then overwritten with the values found in `data` (or with a
            placeholder string when they are absent).

        Raises:
            KeyError: If "mteb_dataset_name" is missing, or if the task name cannot be found in
                the task registry (not even case-insensitively).
        """
        from mteb.get_tasks import _TASKS_REGISTRY

        # in case the task name is not found in the registry, try to find a lower case version
        lower_case_registry = {k.lower(): v for k, v in _TASKS_REGISTRY.items()}

        # shallow copy: top-level keys can be popped without touching `data`, but the
        # per-split dictionaries are still shared with `data`
        scores = {**data}

        dataset_revision = scores.pop(
            "dataset_revision", "dataset revision not available"
        )
        task_name = scores.pop("mteb_dataset_name")
        mteb_version = scores.pop("mteb_version", "mteb version not available")

        # calculate evaluation time across all splits (move to top level)
        # NOTE: `pop` removes the key from the split dictionaries shared with `data`
        evaluation_time = 0
        for split, split_score in scores.items():
            if "evaluation_time" in split_score:
                evaluation_time += split_score.pop("evaluation_time")

        # normalize the scores to always be {split: {hf_subset: scores}}
        # a split is treated as keyed by hf_subset when any of its values is a dict, ignoring
        # the keys below which hold nested score groups rather than subsets
        contains_hf_subset = any(
            isinstance(hf_subset_scores, dict)
            for split_scores in scores.values()
            for k, hf_subset_scores in split_scores.items()
            if k
            not in {"v_measures", "cos_sim", "euclidean", "manhattan", "dot", "max"}
        )
        if not contains_hf_subset:
            # no subset level present: wrap each split's scores under the "default" subset
            for split, split_score in scores.items():
                scores[split] = {"default": split_score.copy()}

        if task_name in outdated_tasks:
            logger.debug(
                f"Loading {task_name} as a dummy task as it no longer exists within MTEB. To avoid this set `load_historic_data=False`"
            )
            task = outdated_tasks[task_name]
        else:
            if task_name in renamed_tasks:
                task_name = renamed_tasks[task_name]
            # the fallback argument is evaluated eagerly, so a name that is also missing from
            # the lower-case registry raises KeyError here
            task = _TASKS_REGISTRY.get(
                task_name, lower_case_registry[task_name.lower()]
            )

        # make sure that main score exists
        main_score = task.metadata.main_score
        for split, split_score in scores.items():
            for hf_subset, hf_subset_scores in split_score.items():
                # flatten nested score groups into "<name>_<metric>" keys, renaming the old
                # group names to the current ScoringFunction values where they differ
                for name, prev_name in [
                    (ScoringFunction.COSINE.value, "cos_sim"),
                    (ScoringFunction.MANHATTAN.value, "manhattan"),
                    (ScoringFunction.EUCLIDEAN.value, "euclidean"),
                    (ScoringFunction.DOT_PRODUCT.value, "dot"),
                    ("max", "max"),
                    ("similarity", "similarity"),
                ]:
                    prev_name_scores = hf_subset_scores.pop(prev_name, None)
                    if prev_name_scores is not None:
                        for k, v in prev_name_scores.items():
                            hf_subset_scores[f"{name}_{k}"] = v

                if "main_score" not in hf_subset_scores:
                    if main_score in hf_subset_scores:
                        hf_subset_scores["main_score"] = hf_subset_scores[main_score]
                    else:
                        # the task's main metric is absent: warn and store None
                        log_once.warning(f"Main score {main_score} not found in scores")
                        hf_subset_scores["main_score"] = None

        # specific fixes:
        if task_name == "MLSUMClusteringP2P" and mteb_version in [  # noqa: PLR6201
            "1.1.2.dev0",
            "1.1.3.dev0",
        ]:  # back then it was only the french subsection which was implemented
            scores["test"]["fr"] = scores["test"].pop("default")
        if task_name == "MLSUMClusteringS2S" and mteb_version in [  # noqa: PLR6201
            "1.1.2.dev0",
            "1.1.3.dev0",
        ]:
            scores["test"]["fr"] = scores["test"].pop("default")
        if task_name == "XPQARetrieval":  # subset were renamed from "fr" to "fra-fra"
            if "test" in scores and "fr" in scores["test"]:
                scores["test"]["fra-fra"] = scores["test"].pop("fr")

        result: TaskResult = TaskResult.from_task_results(
            task,
            scores,
            evaluation_time,
            kg_co2_emissions=None,
        )
        # restore the values read from the legacy file on the created object
        result.dataset_revision = dataset_revision
        result.mteb_version = mteb_version
        return result

    def get_score(
        self,
        splits: list[SplitName] | None = None,
        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
        scripts: list[ISOLanguageScript] | None = None,
        getter: Callable[[ScoresDict], Score] = lambda scores: scores["main_score"],
        aggregation: Callable[[list[Score]], float] = np.mean,
    ) -> float:
        """Aggregate one value per matching subset over the requested splits.

        Args:
            splits: The splits to consider. Defaults to every split present in the scores.
            languages: The languages to consider. Can be ISO language codes or ISO language script codes.
            scripts: The scripts to consider.
            getter: A function that takes a scores dictionary and returns a score e.g. "main_score" or "evaluation_time".
            aggregation: The aggregation function applied to the collected values.

        Returns:
            The result of the aggregation function on the scores.

        Raises:
            ValueError: If a requested split is not present in the scores.
        """
        selected_splits = list(self.scores.keys()) if splits is None else splits
        lang_filter = LanguageScripts.from_languages_and_scripts(languages, scripts)

        collected = []
        for split in selected_splits:
            if split not in self.scores:
                raise ValueError(f"Split {split} not found in scores")

            for subset_scores in self.scores[split]:
                # a subset contributes once, as soon as one of its languages passes the filter
                if any(
                    lang_filter.contains_language(lang)
                    for lang in subset_scores["languages"]
                ):
                    collected.append(getter(subset_scores))

        return aggregation(collected)

    def _get_score_fast(
        self,
        splits: Iterable[str] | None = None,
        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
        subsets: Iterable[str] | None = None,
    ) -> float:
        """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.

        Args:
            splits: The splits to consider.
            languages: The languages to consider. Can be ISO language codes or ISO language script codes.
            subsets: The hf_subsets to consider.

        Returns:
            The mean main score for the specified splits, languages and subsets.
        """
        if splits is None:
            splits = self.scores.keys()
        val_sum = 0
        n_val = 0
        for split in splits:
            if split not in self.scores:
                raise ValueError(f"Split missing from scores: {split}")

            for scores in self.scores[split]:
                langs = scores["languages"]
                hf_subset = scores["hf_subset"]
                main_score = scores.get("main_score", None)
                if main_score is None:
                    raise ValueError(f"Missing main score for subset: {hf_subset}")
                if subsets and hf_subset not in subsets:
                    continue
                elif subsets:
                    val_sum += main_score
                    n_val += 1
                    continue

                if languages is None:
                    val_sum += main_score
                    n_val += 1
                    continue
                for lang in langs:
                    if lang.split("-")[0] in languages:
                        val_sum += main_score
                        n_val += 1
                        logger.info(f"{val_sum=}, {n_val=}")
                        break
        if n_val == 0:
            raise ValueError("No splits had scores for the specified languages.")
        return val_sum / n_val

    @classmethod
    def from_validated(cls, **data: Any) -> TaskResult:
        """Build a TaskResult from data that is already validated.

        The keyword arguments are handed to `model_construct` unchanged.

        Returns:
            The created TaskResult object.
        """
        instance = cls.model_construct(**data)
        return instance

    def __repr__(self) -> str:
        """Return a short summary showing the task name and the main score with two decimals.

        Reading `main_score` calls `get_score()`, so this is not a plain attribute dump.
        """
        return f"TaskResult(task_name={self.task_name}, main_score={self.main_score:.2f}, scores=..., ...)"

    def only_main_score(self) -> TaskResult:
        """Create a copy of this result that keeps only the main score of every subset.

        Each subset entry is reduced to "hf_subset", "main_score" and "languages"; missing
        values fall back to "default", NaN and an empty list respectively.

        Returns:
            A new TaskResult object with only the main score.
        """
        trimmed: dict[str, list[Score]] = {
            split: [
                {
                    "hf_subset": entry.get("hf_subset", "default"),
                    "main_score": entry.get("main_score", np.nan),
                    "languages": entry.get("languages", []),
                }
                for entry in subset_entries
            ]
            for split, subset_entries in self.scores.items()
        }
        payload = {**self.to_dict(), "scores": trimmed}
        return TaskResult.from_validated(**payload)

    def validate_and_filter_scores(
        self,
        task: AbsTask | None = None,
    ) -> TaskResult:
        """Validate and filter the scores against the task metadata.

        Splits that are not among the task's eval splits are dropped, and within each kept split
        only the task's hf_subsets are kept (for aggregate tasks: only the "default" subset).
        Splits or subsets that the task expects but that are absent are not treated as errors:
        a warning is logged and a placeholder entry with a NaN main score is inserted.
        Returns a new TaskResult object; this object is not modified.

        Args:
            task: The task to validate the scores against. E.g. if the task supplied is limited to certain splits and languages,
                the scores will be filtered to only include those splits and languages. If None it will attempt to get the task from the task_name.

        Returns:
            A new TaskResult object with the validated and filtered scores.
        """
        from mteb.get_tasks import get_task

        if task is None:
            task = get_task(self.task_name)

        splits = task.eval_splits
        hf_subsets = set(task.hf_subsets)  # Convert to set once

        new_scores: dict[str, list[Score]] = {}
        seen_splits = set()
        for split in self.scores:
            # drop splits the task does not evaluate on
            if split not in splits:
                continue
            seen_subsets = set()
            if task.is_aggregate:
                # aggregate tasks only have the default subset, but in metadata can be multiple
                new_scores[split] = [
                    _scores
                    for _scores in self.scores[split]
                    if _scores["hf_subset"] == "default"
                ]
                seen_subsets = {"default"}
            else:
                new_scores[split] = [
                    _scores
                    for _scores in self.scores[split]
                    if _scores["hf_subset"] in hf_subsets
                ]
            for _scores in new_scores[split]:
                seen_subsets.add(_scores["hf_subset"])

            # for aggregate tasks "default" is always in seen_subsets, so this branch is
            # only taken for non-aggregate tasks with missing subsets
            if seen_subsets != hf_subsets and not (
                task.is_aggregate and "default" in seen_subsets
            ):
                missing_subsets = hf_subsets - seen_subsets
                # keep the warning short: show at most two of the missing subsets
                if len(missing_subsets) > 2:
                    subset1, subset2 = list(missing_subsets)[:2]
                    missing_subsets_str = f"{{'{subset1}', '{subset2}', ...}}"
                else:
                    missing_subsets_str = str(missing_subsets)

                log_once.warning(
                    f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
                )
                # insert a NaN placeholder for every missing subset of this split
                for missing_subset in missing_subsets:
                    new_scores[split].append(
                        {
                            "hf_subset": missing_subset,
                            "main_score": np.nan,
                            "languages": task.metadata.hf_subsets_to_langscripts.get(
                                missing_subset, []
                            ),
                        }
                    )
            seen_splits.add(split)
        if seen_splits != set(splits):
            log_once.warning(
                f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
            )
            # splits that are entirely absent get a NaN placeholder for each task subset
            for missing_split in set(splits) - seen_splits:
                new_scores[missing_split] = []
                for missing_subset in hf_subsets:
                    new_scores[missing_split].append(
                        {
                            "hf_subset": missing_subset,
                            "main_score": np.nan,
                            "languages": task.metadata.hf_subsets_to_langscripts.get(
                                missing_subset, []
                            ),
                        }
                    )
        data = self.model_dump()
        data["scores"] = new_scores
        # built with `model_construct` from the dumped fields plus the filtered scores
        return type(self).model_construct(**data)

    def is_mergeable(
        self,
        result: TaskResult | AbsTask,
        criteria: list[str] | list[Criteria] | None = None,
        raise_error: bool = False,
    ) -> bool:
        """Checks if the TaskResult object can be merged with another TaskResult or Task.

        Args:
            result: The TaskResult or Task object to check against.
            criteria: Additional criteria to check for merging. Can be "dataset_revision" or "mteb_version" (opt-in).
                It will always check that the task name match. Defaults to ["dataset_revision"] when None.
            raise_error: If True, raises an error if the objects cannot be merged. If False, returns False.

        Returns:
            True if the TaskResult object can be merged with the other object, False otherwise.

        Raises:
            ValueError: If `raise_error` is True and the objects cannot be merged.
        """
        # None sentinel instead of a mutable list as the default argument
        if criteria is None:
            criteria = ["dataset_revision"]
        criteria = [Criteria.from_str(c) if isinstance(c, str) else c for c in criteria]

        def _reject(msg: str) -> bool:
            """Raise or log `msg` depending on `raise_error`; returns False when not raising."""
            if raise_error:
                raise ValueError(msg)
            logger.debug(msg)
            return False

        if isinstance(result, TaskResult):
            name = result.task_name
            revision = result.dataset_revision
            mteb_version = result.mteb_version
        elif isinstance(result, AbsTask):
            # a task carries no stored version, so compare against the installed mteb version
            mteb_version = version("mteb")
            name = result.metadata.name
            revision = result.metadata.revision
        else:
            return _reject("result must be a TaskResult or AbsTask object")

        if self.task_name != name:
            return _reject(
                f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
            )

        if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
            return _reject(
                f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
            )

        if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
            return _reject(
                f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
            )

        return True

    def merge(
        self,
        new_results: TaskResult,
        criteria: list[str] | list[Criteria] | None = None,
    ) -> TaskResult:
        """Merges two TaskResult objects.

        Scores are merged per split and hf_subset, with entries from `new_results` replacing
        existing ones for the same subset. Evaluation time and CO2 emissions are summed only
        when both results provide a non-zero value, otherwise they are set to None. The date is
        the later of the two dates, and `dataset_revision` and `task_name` are taken from
        `new_results`.

        Args:
            new_results: The new TaskResult object to merge with the current one.
            criteria: Additional criteria to check for merging. Can be "mteb_version" or "dataset_revision".
                It will always check that the task name match. Defaults to ["dataset_revision"] when None.

        Returns:
            A new TaskResult object with the merged scores.

        Raises:
            ValueError: If the two results cannot be merged according to `is_mergeable`.
        """
        # None sentinel instead of a mutable list as the default argument
        if criteria is None:
            criteria = ["dataset_revision"]
        self.is_mergeable(new_results, criteria=criteria, raise_error=True)

        # shallow copy: the split -> list mapping is new, the lists themselves are shared
        merged_scores = self.scores.copy()

        for split, scores in new_results.scores.items():
            if split in merged_scores:
                merged_scores[split] = self._merge_split_scores(
                    merged_scores[split], scores
                )
            else:
                merged_scores[split] = scores

        merged_kg_co2_emissions = None
        if self.kg_co2_emissions and new_results.kg_co2_emissions:
            merged_kg_co2_emissions = (
                self.kg_co2_emissions + new_results.kg_co2_emissions
            )

        merged_evaluation_time = None
        if self.evaluation_time and new_results.evaluation_time:
            merged_evaluation_time = self.evaluation_time + new_results.evaluation_time

        # keep the most recent date of the two results
        date = self.date
        if new_results.date is not None and (date is None or new_results.date > date):
            date = new_results.date
        mteb_ver = self._compute_top_level_mteb_version(merged_scores)

        return TaskResult(
            dataset_revision=new_results.dataset_revision,
            task_name=new_results.task_name,
            mteb_version=mteb_ver,
            scores=merged_scores,
            evaluation_time=merged_evaluation_time,
            kg_co2_emissions=merged_kg_co2_emissions,
            date=date,
        )

    @staticmethod
    def _compute_top_level_mteb_version(
        scores: dict[SplitName, list[ScoresDict]],
    ) -> str | None:
        """Compute the top-level mteb_version from per-subset versions.

        Returns a version range (e.g. "2.12.0-2.12.19") if subsets were
        evaluated with different versions, a single version if all match,
        or None if no per-subset versions are present.
        """
        versions: set[str] = set()
        for split_scores in scores.values():
            for subset_scores in split_scores:
                v = subset_scores.get("mteb_version")
                if v is not None:
                    versions.add(v)
        if not versions:
            return None
        min_ver = str(min(Version(v) for v in versions))
        max_ver = str(max(Version(v) for v in versions))
        if min_ver == max_ver:
            return min_ver
        return f"{min_ver}-{max_ver}"

    @staticmethod
    def _merge_split_scores(
        existing_scores: list[ScoresDict], new_scores: list[ScoresDict]
    ) -> list[ScoresDict]:
        merged = {score["hf_subset"]: score for score in existing_scores}
        for score in new_scores:
            merged[score["hf_subset"]] = score
        return list(merged.values())

    def get_missing_evaluations(self, task: AbsTask) -> dict[str, list[str]]:
        """Checks which splits and subsets are missing from the results.

        Args:
            task: The task to check against.

        Returns:
            A dictionary with the splits as keys and a list of missing subsets as values.
            Missing subsets are listed in the order of `task.hf_subsets`. Splits with nothing
            missing are left out.
        """
        missing_splits: dict[str, list[str]] = {}
        for split in task.eval_splits:
            if split not in self.scores:
                # split is fully missing: every subset is missing (copied so that callers
                # cannot mutate the task's own list through the result)
                missing_splits[split] = list(task.hf_subsets)
            else:
                present = {score["hf_subset"] for score in self.scores[split]}
                # dict.fromkeys de-duplicates while keeping the task's subset order,
                # making the result deterministic
                missing_subsets = [
                    subset
                    for subset in dict.fromkeys(task.hf_subsets)
                    if subset not in present
                ]
                if missing_subsets:
                    missing_splits[split] = missing_subsets

        return missing_splits

    @deprecated(
        "HF deprecated `EvalResults` in favor of `Benchmarks` and it's results. "
        "To push new results use ModelMeta.push_eval_results()."
    )
    def get_hf_eval_results(self) -> list[EvalResult]:
        """Build one HF `EvalResult` for every (split, hf_subset) entry of the scores.

        Returns:
            List of EvalResult objects for each split and subset.
        """
        task_metadata = self.task.metadata
        hf_task_type = task_metadata._hf_task_type()[0]

        def _as_eval_result(split, subset_results):
            # entries without an "hf_subset" are reported under the "default" config
            subset = subset_results.get("hf_subset", "default")
            return EvalResult(
                task_type=hf_task_type,
                task_name=task_metadata.type,
                dataset_type=task_metadata.dataset["path"],
                dataset_name=f"{task_metadata.name} ({subset})",
                dataset_config=subset,
                dataset_split=split,
                dataset_revision=task_metadata.dataset["revision"],
                metric_type=task_metadata.main_score,
                metric_name=task_metadata.main_score,
                metric_value=subset_results["main_score"],
                source_name="MTEB",
                source_url="https://github.com/embeddings-benchmark/mteb/",
            )

        return [
            _as_eval_result(split, subset_results)
            for split, scores in self.scores.items()
            for subset_results in scores
        ]

    def _to_hf_benchmark_result(self, user: str | None = None) -> HFEvalResults:
        """Convert the scores into HF benchmark results.

        One entry is created per (split, hf_subset), holding the main score multiplied by 100
        and rounded to 5 decimals. An additional entry with the aggregated score is appended
        when the number of evaluated splits and subsets equals the numbers declared in the
        task metadata.

        Args:
            user: Passed on as the `user` of the result source.

        Returns:
            The `HFEvalResults` object validated from the collected entries.
        """
        from mteb.get_tasks import get_task

        task_metadata = get_task(self.task_name).metadata
        dataset_id = task_metadata.dataset["path"]
        dataset_revision = task_metadata.dataset["revision"]

        notes = f"Obtained using MTEB v{self.mteb_version}"
        source = HFEvalResultSource(
            url="https://github.com/embeddings-benchmark/mteb/",
            name=notes,
            user=user,
        )

        eval_results = []
        evaluated_subsets = set()
        for split, split_results in self.scores.items():
            for subset_results in split_results:
                subset_name = subset_results.get("hf_subset", "default")
                task_id = f"{self.task_name}_{subset_name}_{split}"
                target = HFEvalResultDataset(
                    id=dataset_id,
                    task_id=task_id,
                    revision=dataset_revision,
                )
                scaled = round(subset_results["main_score"] * 100, 5)
                eval_results.append(
                    HFEvalResult(
                        dataset=target,
                        value=scaled,
                        source=source,
                        date=self.date,
                        notes=notes,
                    )
                )
                evaluated_subsets.add(subset_name)

        # only counts are compared here, not the actual split / subset names
        covers_all_splits = len(self.scores) == len(task_metadata.eval_splits)
        if covers_all_splits and len(evaluated_subsets) == len(
            task_metadata.hf_subsets
        ):
            # overall score
            overall_target = HFEvalResultDataset(
                id=dataset_id,
                task_id=task_metadata.name,
                revision=dataset_revision,
            )
            overall_value = round(self.get_score() * 100, 5)
            eval_results.append(
                HFEvalResult(
                    dataset=overall_target,
                    value=overall_value,
                    source=source,
                    date=self.date,
                    notes=notes,
                )
            )
        return HFEvalResults.model_validate(eval_results)

`domains` `property` ¶

Get the domains of the task.

`eval_splits` `property` ¶

Get the eval splits present in the scores.

`hf_subsets` `property` ¶

Get the hf_subsets present in the scores.

`is_public` `property` ¶

Check if the task is public.

`languages` `property` ¶

Get the languages present in the scores.

`main_score` `property` ¶

Get the main score of the result.

`task` `cached` `property` ¶

Get the task associated with the result.

`task_type` `property` ¶

Get the type of the task.

`from_dict(data)` `classmethod` ¶

Create a TaskResult from a dictionary.

Parameters:

Name	Type	Description	Default
`data`	`dict[str, Any]`	The dictionary to create the TaskResult from.	required

Returns:

Type	Description
`Self`	The created TaskResult object.

Source code in mteb/results/task_result.py

@classmethod
def from_dict(cls, data: dict[str, Any]) -> Self:
    """Build a TaskResult by validating a dictionary.

    Args:
        data: Dictionary holding the TaskResult fields.

    Returns:
        The validated TaskResult object.
    """
    instance = cls.model_validate(data)
    return instance

`from_disk(path, load_historic_data=True)` `classmethod` ¶

Load TaskResult from disk.

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path to the file to load.	required
`load_historic_data`	`bool`	Whether to attempt to load historic data from before v1.11.0.	`True`

Returns:

Type	Description
`TaskResult`	The loaded TaskResult object.

Source code in mteb/results/task_result.py

@classmethod
def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult:
    """Load TaskResult from disk.

    Args:
        path: The path to the file to load.
        load_historic_data: Whether to attempt to load historic data from before v1.11.0.

    Returns:
        The loaded TaskResult object.

    Raises:
        ValueError: If `load_historic_data` is False and the file does not validate as a
            TaskResult.
    """

    def _is_older_than(raw_version: str | None, threshold: str) -> bool:
        """Return True if `raw_version` parses to a version below `threshold`.

        A missing or unparsable version is never reported as older.
        """
        if raw_version is None:
            return False
        try:
            return Version(raw_version) < Version(threshold)
        except ValueError:
            # assumes `Version` is packaging's, whose InvalidVersion is a ValueError
            pass
        # merged results store a "<min>-<max>" range (see `_compute_top_level_mteb_version`);
        # compare against the lower bound of that range
        lower_bound = raw_version.split("-", 1)[0]
        try:
            return Version(lower_bound) < Version(threshold)
        except ValueError:
            return False

    with path.open("r", encoding="utf-8") as f:
        json_str = f.read()

    if not load_historic_data:
        try:
            return cls.model_validate_json(json_str)
        except Exception as e:
            raise ValueError(
                f"Error loading TaskResult from disk. You can try to load historic data by setting `load_historic_data=True`. Error: {e}"
            ) from e
    data = json.loads(json_str)
    has_version_key = "mteb_version" in data
    stored_version = data["mteb_version"] if has_version_key else None
    # assume it is before 1.11.0 if the version is not present
    pre_1_11_load = not has_version_key or _is_older_than(stored_version, "1.11.0")

    try:
        obj: TaskResult = cls.model_validate_json(json_str)
    except Exception as e:
        if not pre_1_11_load:
            # bare raise keeps the original traceback untouched
            raise
        logger.debug(
            f"Could not load TaskResult from disk, got error: {e}. Attempting to load from disk using format from before v1.11.0"
        )
        obj = cls._convert_from_before_v1_11_0(data)

    if _is_older_than(stored_version, "1.12.48"):
        cls._fix_pair_classification_scores(obj)

    return obj

`from_task_results(task, scores, evaluation_time, kg_co2_emissions=None, date=None)` `classmethod` ¶

Create a TaskResult from the task and scores.

Parameters:

Name	Type	Description	Default
`task`	`AbsTask \| type[AbsTask]`	The task to create the TaskResult from.	required
`scores`	`dict[SplitName, Mapping[HFSubset, ScoresDict]]`	The scores of the model on the dataset. The scores is a dictionary with the following structure; dict[SplitName, dict[HFSubset, Scores]]. Where Scores is a dictionary with the following structure; dict[str, Any]. Where the keys and values are scores. Split is the split of the dataset.	required
`evaluation_time`	`float`	The time taken to evaluate the model.	required
`kg_co2_emissions`	`float \| None`	The kg of CO2 emissions produced by the model during evaluation.	`None`
`date`	`datetime \| None`	The date the model was trained on.	`None`

Source code in mteb/results/task_result.py

@classmethod
def from_task_results(
    cls,
    task: AbsTask | type[AbsTask],
    scores: dict[SplitName, Mapping[HFSubset, ScoresDict]],
    evaluation_time: float,
    kg_co2_emissions: float | None = None,
    date: datetime.datetime | None = None,
) -> TaskResult:
    """Build a TaskResult from a task and its raw, nested scores.

    Each ``scores[split][hf_subset]`` dictionary becomes one entry in the list of
    scores for that split. Every entry is annotated with its ``hf_subset``, the
    ``languages`` it covers and the installed ``mteb_version``.

    Args:
        task: The task (or task class) the scores belong to. Its metadata supplies the
            task name, the dataset revision and the subset-to-languages mapping.
        scores: Nested scores with the structure dict[SplitName, dict[HFSubset, Scores]],
            where Scores is a dict[str, Any] of score names to values.
        evaluation_time: The time taken to evaluate the model.
        kg_co2_emissions: The kg of CO2 emissions produced during evaluation, if known.
        date: Passed through unchanged as the ``date`` of the created TaskResult.

    Returns:
        The newly created TaskResult.
    """
    metadata = task.metadata
    langscripts_by_subset = metadata.hf_subsets_to_langscripts
    installed_version = version("mteb")

    scores_by_split = defaultdict(list)
    for split_name, per_subset in scores.items():
        for subset_name, subset_scores in per_subset.items():
            if subset_name in langscripts_by_subset:
                languages = langscripts_by_subset[subset_name]
            else:
                # Aggregated tasks may report a "default" subset that has no entry in
                # the per-subset mapping; fall back to every language of the mapping,
                # de-duplicated while keeping first-seen order.
                languages = []
                for candidates in langscripts_by_subset.values():
                    for candidate in candidates:
                        if candidate not in languages:
                            languages.append(candidate)
            scores_by_split[split_name].append(
                {
                    **subset_scores,
                    "hf_subset": subset_name,
                    "languages": languages,
                    "mteb_version": installed_version,
                }
            )

    return TaskResult(
        dataset_revision=metadata.revision,
        task_name=metadata.name,
        mteb_version=installed_version,
        scores=scores_by_split,
        evaluation_time=evaluation_time,
        kg_co2_emissions=kg_co2_emissions,
        date=date,
    )

`from_validated(**data)` `classmethod` ¶

Create a TaskResult from validated data.

Returns:

Type	Description
`TaskResult`	The created TaskResult object.

Source code in mteb/results/task_result.py

@classmethod
def from_validated(cls, **data: Any) -> TaskResult:
    """Build a TaskResult from data that has already been validated.

    The keyword arguments are forwarded as-is to ``cls.model_construct``.

    Returns:
        The created TaskResult object.
    """
    instance = cls.model_construct(**data)
    return instance

`get_hf_eval_results()` ¶

Create HF evaluation results objects from TaskResult objects.

Returns:

Type	Description
`list[EvalResult]`	List of EvalResult objects for each split and subset.

Source code in mteb/results/task_result.py

@deprecated(
    "HF deprecated `EvalResults` in favor of `Benchmarks` and it's results. "
    "To push new results use ModelMeta.push_eval_results()."
)
def get_hf_eval_results(self) -> list[EvalResult]:
    """Convert the stored scores into HF ``EvalResult`` objects.

    One ``EvalResult`` is produced for every (split, subset entry) pair in
    ``self.scores``. Entries without an ``"hf_subset"`` key are reported under
    the subset name ``"default"``.

    Returns:
        List of EvalResult objects for each split and subset.
    """
    metadata = self.task.metadata
    hf_task_type = metadata._hf_task_type()[0]

    def _as_eval_result(split_name, entry):
        # Build the EvalResult for a single subset entry of a single split.
        config_name = entry.get("hf_subset", "default")
        return EvalResult(
            task_type=hf_task_type,
            task_name=metadata.type,
            dataset_type=metadata.dataset["path"],
            dataset_name=f"{metadata.name} ({config_name})",
            dataset_config=config_name,
            dataset_split=split_name,
            dataset_revision=metadata.dataset["revision"],
            metric_type=metadata.main_score,
            metric_name=metadata.main_score,
            metric_value=entry["main_score"],
            source_name="MTEB",
            source_url="https://github.com/embeddings-benchmark/mteb/",
        )

    return [
        _as_eval_result(split_name, entry)
        for split_name, entries in self.scores.items()
        for entry in entries
    ]

`get_missing_evaluations(task)` ¶

Checks which splits and subsets are missing from the results.

Parameters:

Name	Type	Description	Default
`task`	`AbsTask`	The task to check against.	required

Returns:

Type	Description
`dict[str, list[str]]`	A dictionary with the splits as keys and a list of missing subsets as values.

Source code in mteb/results/task_result.py

def get_missing_evaluations(self, task: AbsTask) -> dict[str, list[str]]:
    """Report which splits and subsets of ``task`` have no stored scores.

    Args:
        task: The task to check against.

    Returns:
        A dictionary mapping each incomplete split to the list of its missing subsets.
        Splits for which every subset is present are left out.
    """
    missing: dict[str, list[str]] = {}
    for split_name in task.eval_splits:
        if split_name not in self.scores:
            # The split is fully missing, so every subset of the task is missing.
            missing[split_name] = task.hf_subsets
            continue

        present = {entry["hf_subset"] for entry in self.scores[split_name]}
        absent = list(set(task.hf_subsets) - present)
        if absent:
            missing[split_name] = absent

    return missing

`get_score(splits=None, languages=None, scripts=None, getter=lambda scores: scores['main_score'], aggregation=np.mean)` ¶

Get a score for the specified splits, languages, scripts and aggregation function.

Parameters:

Name	Type	Description	Default
`splits`	`list[SplitName] \| None`	The splits to consider.	`None`
`languages`	`list[ISOLanguage \| ISOLanguageScript] \| None`	The languages to consider. Can be ISO language codes or ISO language script codes.	`None`
`scripts`	`list[ISOLanguageScript] \| None`	The scripts to consider.	`None`
`getter`	`Callable[[ScoresDict], Score]`	A function that takes a scores dictionary and returns a score e.g. "main_score" or "evaluation_time".	`lambda scores: scores['main_score']`
`aggregation`	`Callable[[list[Score]], float]`	The aggregation function to use.	`mean`

Returns:

Type	Description
`float`	The result of the aggregation function on the scores.

Source code in mteb/results/task_result.py

def get_score(
    self,
    splits: list[SplitName] | None = None,
    languages: list[ISOLanguage | ISOLanguageScript] | None = None,
    scripts: list[ISOLanguageScript] | None = None,
    getter: Callable[[ScoresDict], Score] = lambda scores: scores["main_score"],
    aggregation: Callable[[list[Score]], float] = np.mean,
) -> float:
    """Aggregate one score over the selected splits, languages and scripts.

    A subset entry contributes a value as soon as at least one of its
    ``"languages"`` is accepted by the language/script filter.

    Args:
        splits: The splits to consider. Defaults to every split present in the scores.
        languages: The languages to consider. Can be ISO language codes or ISO language script codes.
        scripts: The scripts to consider.
        getter: A function that takes a scores dictionary and returns a score e.g. "main_score" or "evaluation_time".
        aggregation: The aggregation function applied to the collected values.

    Returns:
        The result of the aggregation function on the scores.

    Raises:
        ValueError: If a requested split is not present in the scores.
    """
    selected_splits = list(self.scores.keys()) if splits is None else splits
    lang_scripts = LanguageScripts.from_languages_and_scripts(languages, scripts)

    collected = []
    for split in selected_splits:
        if split not in self.scores:
            raise ValueError(f"Split {split} not found in scores")

        for entry in self.scores[split]:
            # any() stops at the first accepted language, so each entry counts once.
            if any(lang_scripts.contains_language(lang) for lang in entry["languages"]):
                collected.append(getter(entry))

    return aggregation(collected)

`is_mergeable(result, criteria=['dataset_revision'], raise_error=False)` ¶

Checks if the TaskResult object can be merged with another TaskResult or Task.

Parameters:

Name	Type	Description	Default
`result`	`TaskResult \| AbsTask`	The TaskResult or Task object to check against.	required
`criteria`	`list[str] \| list[Criteria]`	Additional criteria to check for merging. Can be "dataset_revision" or "mteb_version" (opt-in). It will always check that the task name match.	`['dataset_revision']`
`raise_error`	`bool`	If True, raises an error if the objects cannot be merged. If False, returns False.	`False`

Returns:

Type	Description
`bool`	True if the TaskResult object can be merged with the other object, False otherwise.

Source code in mteb/results/task_result.py

def is_mergeable(
    self,
    result: TaskResult | AbsTask,
    criteria: list[str] | list[Criteria] = [
        "dataset_revision",
    ],
    raise_error: bool = False,
) -> bool:
    """Check whether this TaskResult can be merged with another TaskResult or a task.

    The task names must always match. The dataset revision and the mteb version are
    only compared when the corresponding criterion is listed in ``criteria``.

    Args:
        result: The TaskResult or Task object to check against.
        criteria: Additional criteria to check for merging. Can be "dataset_revision" or "mteb_version" (opt-in).
            It will always check that the task name match.
        raise_error: If True, raises an error if the objects cannot be merged. If False, returns False.

    Returns:
        True if the TaskResult object can be merged with the other object, False otherwise.

    Raises:
        ValueError: If the objects cannot be merged and ``raise_error`` is True.
    """

    def _reject(reason: str) -> bool:
        # Single exit for every failed check: raise when asked to, otherwise log and decline.
        if raise_error:
            raise ValueError(reason)
        logger.debug(reason)
        return False

    active_criteria = [
        Criteria.from_str(c) if isinstance(c, str) else c for c in criteria
    ]

    if isinstance(result, TaskResult):
        name = result.task_name
        revision = result.dataset_revision
        mteb_version = result.mteb_version
    elif isinstance(result, AbsTask):
        # A task carries no recorded version, so the installed one is used.
        mteb_version = version("mteb")
        name = result.metadata.name
        revision = result.metadata.revision
    else:
        return _reject("result must be a TaskResult or AbsTask object")

    if self.task_name != name:
        return _reject(
            f"Cannot merge TaskResult objects as they are derived from different tasks ({self.task_name} and {name})"
        )

    if Criteria.MTEB_VERSION in active_criteria and self.mteb_version != mteb_version:
        return _reject(
            f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
        )

    if (
        Criteria.DATASET_REVISION in active_criteria
        and self.dataset_revision != revision
    ):
        return _reject(
            f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
        )

    return True

`merge(new_results, criteria=['dataset_revision'])` ¶

Merges two TaskResult objects.

Parameters:

Name	Type	Description	Default
`new_results`	`TaskResult`	The new TaskResult object to merge with the current one.	required
`criteria`	`list[str] \| list[Criteria]`	Additional criteria to check for merging. Can be "mteb_version" or "dataset_revision". It will always check that the task name match.	`['dataset_revision']`

Returns:

Type	Description
`TaskResult`	A new TaskResult object with the merged scores.

Source code in mteb/results/task_result.py

def merge(
    self,
    new_results: TaskResult,
    criteria: list[str] | list[Criteria] = [
        "dataset_revision",
    ],
) -> TaskResult:
    """Combine this TaskResult with ``new_results`` into a new TaskResult.

    Splits present in both objects are combined through ``_merge_split_scores``;
    splits present in only one of them are carried over as they are. The dataset
    revision and task name of the merged object are taken from ``new_results``.

    Args:
        new_results: The new TaskResult object to merge with the current one.
        criteria: Additional criteria to check for merging. Can be "mteb_version" or "dataset_revision".
            It will always check that the task name match.

    Returns:
        A new TaskResult object with the merged scores.

    Raises:
        ValueError: If the two objects are not mergeable under ``criteria``.
    """
    self.is_mergeable(new_results, criteria=criteria, raise_error=True)

    combined_scores = self.scores.copy()
    for split_name, incoming in new_results.scores.items():
        combined_scores[split_name] = (
            self._merge_split_scores(combined_scores[split_name], incoming)
            if split_name in combined_scores
            else incoming
        )

    # Emissions and evaluation time are only summed when both sides hold a
    # truthy value; otherwise the merged value is left as None.
    total_kg_co2_emissions = (
        self.kg_co2_emissions + new_results.kg_co2_emissions
        if self.kg_co2_emissions and new_results.kg_co2_emissions
        else None
    )
    total_evaluation_time = (
        self.evaluation_time + new_results.evaluation_time
        if self.evaluation_time and new_results.evaluation_time
        else None
    )

    # Keep the most recent of the two dates (or whichever one is set).
    latest_date = self.date
    if new_results.date is not None and (
        latest_date is None or new_results.date > latest_date
    ):
        latest_date = new_results.date

    top_level_version = self._compute_top_level_mteb_version(combined_scores)

    return TaskResult(
        dataset_revision=new_results.dataset_revision,
        task_name=new_results.task_name,
        mteb_version=top_level_version,
        scores=combined_scores,
        evaluation_time=total_evaluation_time,
        kg_co2_emissions=total_kg_co2_emissions,
        date=latest_date,
    )

`only_main_score()` ¶

Return a new TaskResult object with only the main score.

Returns:

Type	Description
`TaskResult`	A new TaskResult object with only the main score.

Source code in mteb/results/task_result.py

def only_main_score(self) -> TaskResult:
    """Create a copy of this TaskResult whose score entries keep only the main score.

    Each subset entry is reduced to its ``hf_subset``, ``main_score`` and
    ``languages`` keys; missing keys fall back to ``"default"``, NaN and an
    empty list respectively.

    Returns:
        A new TaskResult object with only the main score.
    """
    trimmed_scores: dict[str, list[Score]] = {
        split_name: [
            {
                "hf_subset": entry.get("hf_subset", "default"),
                "main_score": entry.get("main_score", np.nan),
                "languages": entry.get("languages", []),
            }
            for entry in entries
        ]
        for split_name, entries in self.scores.items()
    }
    payload = {**self.to_dict(), "scores": trimmed_scores}
    return TaskResult.from_validated(**payload)

`to_dict()` ¶

Convert the TaskResult to a dictionary.

Returns:

Type	Description
`dict[str, Any]`	The TaskResult as a dictionary.

Source code in mteb/results/task_result.py

def to_dict(self) -> dict[str, Any]:
    """Dump the TaskResult into a plain dictionary via ``model_dump``.

    Returns:
        The TaskResult as a dictionary.
    """
    dumped = self.model_dump()
    return dumped

`to_disk(path)` ¶

Save TaskResult to disk.

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path to the file to save.	required

Source code in mteb/results/task_result.py

def to_disk(self, path: Path) -> None:
    """Write the TaskResult to ``path`` as indented JSON.

    The ``date`` is stored as a timestamp (or ``None`` when unset) and the
    scores are passed through ``_round_scores`` with 6 digits before writing.

    Args:
        path: The path to the file to save.
    """
    payload = self.model_dump()
    payload["date"] = None if not self.date else self.date.timestamp()
    self._round_scores(payload["scores"], 6)

    with path.open("w") as out_file:  # noqa: PLW1514
        json.dump(payload, out_file, indent=2)

`validate_and_filter_scores(task=None)` ¶

Validate and filter the scores against the task metadata.

This ensures that the scores are correct for the given task by removing any splits other than those specified in the task metadata. Additionally, it ensures that all of the required splits, as well as the languages, are present in the scores. Returns a new TaskResult object.

Parameters:

Name	Type	Description	Default
`task`	`AbsTask \| None`	The task to validate the scores against. E.g. if the task supplied is limited to certain splits and languages, the scores will be filtered to only include those splits and languages. If None it will attempt to get the task from the task_name.	`None`

Returns:

Type	Description
`TaskResult`	A new TaskResult object with the validated and filtered scores.

Source code in mteb/results/task_result.py

def validate_and_filter_scores(
    self,
    task: AbsTask | None = None,
) -> TaskResult:
    """Validate and filter the scores against the task metadata.

    Splits that are not listed in ``task.eval_splits`` are dropped. Within a kept split,
    only entries whose ``hf_subset`` belongs to ``task.hf_subsets`` are kept (for aggregate
    tasks only the ``"default"`` entries are kept). Subsets and splits that the task expects
    but the scores lack are reported through ``log_once.warning`` and filled with placeholder
    entries whose ``main_score`` is NaN.
    Returns a new TaskResult object; the scores of this object are not reassigned.

    Args:
        task: The task to validate the scores against. E.g. if the task supplied is limited to certain splits and languages,
            the scores will be filtered to only include those splits and languages. If None it will attempt to get the task from the task_name.

    Returns:
        A new TaskResult object with the validated and filtered scores.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- TODO confirm).
    from mteb.get_tasks import get_task

    if task is None:
        task = get_task(self.task_name)

    splits = task.eval_splits
    hf_subsets = set(task.hf_subsets)  # Convert to set once

    new_scores: dict[str, list[Score]] = {}
    seen_splits = set()
    for split in self.scores:
        # Splits the task does not evaluate on are dropped entirely.
        if split not in splits:
            continue
        seen_subsets = set()
        if task.is_aggregate:
            # aggregate tasks only have the default subset, but in metadata can be multiple
            new_scores[split] = [
                _scores
                for _scores in self.scores[split]
                if _scores["hf_subset"] == "default"
            ]
            # "default" is marked as seen unconditionally, so the missing-subset
            # branch below is never entered for aggregate tasks.
            seen_subsets = {"default"}
        else:
            new_scores[split] = [
                _scores
                for _scores in self.scores[split]
                if _scores["hf_subset"] in hf_subsets
            ]
        for _scores in new_scores[split]:
            seen_subsets.add(_scores["hf_subset"])

        if seen_subsets != hf_subsets and not (
            task.is_aggregate and "default" in seen_subsets
        ):
            missing_subsets = hf_subsets - seen_subsets
            # Keep the warning short: name at most two missing subsets when more than two are missing.
            if len(missing_subsets) > 2:
                subset1, subset2 = list(missing_subsets)[:2]
                missing_subsets_str = f"{{'{subset1}', '{subset2}', ...}}"
            else:
                missing_subsets_str = str(missing_subsets)

            log_once.warning(
                f"{task.metadata.name}: Missing subsets {missing_subsets_str} for split {split}"
            )
            # Add a NaN placeholder entry for every missing subset of this split.
            for missing_subset in missing_subsets:
                new_scores[split].append(
                    {
                        "hf_subset": missing_subset,
                        "main_score": np.nan,
                        "languages": task.metadata.hf_subsets_to_langscripts.get(
                            missing_subset, []
                        ),
                    }
                )
        seen_splits.add(split)
    if seen_splits != set(splits):
        log_once.warning(
            f"{task.metadata.name}: Missing splits {set(splits) - seen_splits}"
        )
        # Splits that are completely absent get a NaN placeholder for every subset of the task.
        # NOTE(review): this also applies to aggregate tasks, which then receive one placeholder
        # per metadata subset rather than a single "default" entry -- confirm this is intended.
        for missing_split in set(splits) - seen_splits:
            new_scores[missing_split] = []
            for missing_subset in hf_subsets:
                new_scores[missing_split].append(
                    {
                        "hf_subset": missing_subset,
                        "main_score": np.nan,
                        "languages": task.metadata.hf_subsets_to_langscripts.get(
                            missing_subset, []
                        ),
                    }
                )
    # Rebuild an object of the same type from the dumped fields, swapping in the new scores.
    data = self.model_dump()
    data["scores"] = new_scores
    return type(self).model_construct(**data)

`mteb.results.ModelResult` ¶

Bases: BaseModel

Data class to hold the results of a model on a set of tasks.

Attributes:

Name	Type	Description
`model_name`	`str`	Name of the model.
`model_revision`	`str \| None`	Revision of the model.
`task_results`	`list[TaskResult]`	List of TaskResult objects.

Source code in mteb/results/model_result.py

class ModelResult(BaseModel):
    """Data class to hold the results of a model on a set of tasks.

    Attributes:
        model_name: Name of the model.
        model_revision: Revision of the model.
        task_results: List of TaskResult objects.
    """

    model_name: str
    model_revision: str | None
    task_results: list[TaskResult]
    model_config = (
        ConfigDict(  # to free up the name model_* which is otherwise protected
            protected_namespaces=(),
        )
    )
    exceptions: list[TaskError] | None = None
    experiment_name: str | None = None

    def __repr__(self) -> str:
        """Return a compact representation that shows only the number of task results.

        The ``experiment_name`` part is included only when an experiment name is set.
        """
        task_count = len(self.task_results)
        experiment_part = (
            "experiment_name=" + self.experiment_name + ", "
            if self.experiment_name
            else ""
        )
        header = f"ModelResult(model_name='{self.model_name}', model_revision='{self.model_revision}', "
        footer = f"task_results=[...](#{task_count}), ...)"
        return header + experiment_part + footer

    @classmethod
    def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
        """Build a ModelResult from already-validated data.

        Each item of ``data["task_results"]`` is first turned into a TaskResult through
        ``TaskResult.from_validated``; the result is then passed to ``cls.model_construct``.

        Args:
            data: The validated data.
        """
        converted_results = [
            TaskResult.from_validated(**raw_result)  # type: ignore[arg-type]
            for raw_result in data["task_results"]
        ]
        data["task_results"] = converted_results  # type: ignore[assignment]
        return cls.model_construct(**data)  # type: ignore[arg-type]

    def _filter_tasks(
        self,
        task_names: list[str] | None = None,
        *,
        languages: list[str] | None = None,
        domains: list[TaskDomain] | None = None,
        task_types: list[TaskType] | None = None,
        modalities: list[Modalities] | None = None,
        is_public: bool | None = None,
    ) -> ModelResult:
        """Return a new ModelResult holding only the task results that pass every given filter.

        A filter left as ``None`` is not applied. List-valued filters on languages,
        domains and modalities match when at least one requested value is present
        on the task result.
        """

        def _keep(candidate) -> bool:
            # Each active filter can veto the candidate; inactive (None) filters are skipped.
            if task_names is not None and candidate.task_name not in task_names:
                return False
            if languages is not None:
                available_languages = candidate.languages
                if not any(lang in available_languages for lang in languages):
                    return False
            if domains is not None:
                available_domains = candidate.domains
                if not any(domain in available_domains for domain in domains):
                    return False
            if task_types is not None and candidate.task_type not in task_types:
                return False
            if modalities is not None:
                available_modalities = getattr(candidate, "modalities", [])
                if not any(
                    modality in available_modalities for modality in modalities
                ):
                    return False
            if is_public is not None and candidate.is_public is not is_public:
                return False
            return True

        kept_results = [
            task_result for task_result in self.task_results if _keep(task_result)
        ]
        return type(self).model_construct(
            model_name=self.model_name,
            model_revision=self.model_revision,
            task_results=kept_results,
            experiment_name=self.experiment_name,
        )

    def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
        """Keep only the results of the given tasks, filtered against each task.

        Task results whose name matches none of ``tasks`` are dropped; the others are
        passed through ``validate_and_filter_scores`` with their matching task.

        Args:
            tasks: A sequence of AbsTask objects to select from the ModelResult.
        """
        tasks_by_name = {task.metadata.name: task for task in tasks}

        selected_results = []
        for task_res in self.task_results:
            if task_res.task_name not in tasks_by_name:
                continue
            matching_task = tasks_by_name[task_res.task_name]
            selected_results.append(task_res.validate_and_filter_scores(matching_task))

        return type(self).model_construct(
            model_name=self.model_name,
            model_revision=self.model_revision,
            task_results=selected_results,
            experiment_name=self.experiment_name,
        )

    @overload
    def _get_scores(
        self,
        *,
        splits: list[SplitName] | None = None,
        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
        scripts: list[ISOLanguageScript] | None = None,
        getter: Callable[[ScoresDict], Score] | None = None,
        aggregation: Callable[[list[Score]], Any] | None = None,
        format: Literal["wide"] = "wide",
    ) -> dict[str, float]: ...

    @overload
    def _get_scores(
        self,
        *,
        splits: list[SplitName] | None = None,
        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
        scripts: list[ISOLanguageScript] | None = None,
        getter: Callable[[ScoresDict], Score] | None = None,
        aggregation: Callable[[list[Score]], Any] | None = None,
        format: Literal["long"] = "long",
    ) -> list[dict[str, str | float | None]]: ...

    def _get_scores(
        self,
        *,
        splits: list[SplitName] | None = None,
        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
        scripts: list[ISOLanguageScript] | None = None,
        getter: Callable[[ScoresDict], Score] | None = None,
        aggregation: Callable[[list[Score]], Any] | None = None,
        format: Literal["wide", "long"] = "wide",
    ) -> dict[str, float] | list[dict[str, str | float | None]]:
        """Collect one score per task result, in "wide" or "long" format.

        When none of ``getter``, ``aggregation`` and ``scripts`` is given, each score is
        obtained from ``TaskResult._get_score_fast``; otherwise ``TaskResult.get_score`` is
        used, with the main score and ``np.mean`` as defaults for a missing getter or
        aggregation. A task result whose score cannot be computed is skipped with a warning.

        Returns:
            For "wide", a dict mapping task name to score. For "long", a list with one
            dict per task result holding the score and metadata of the run.
        """

        def _main_score(scores):
            # Default getter: the main score of a subset entry.
            return scores["main_score"]

        customised = (
            (getter is not None) or (aggregation is not None) or (scripts is not None)
        )
        if customised:
            if getter is None:
                getter = _main_score
            if aggregation is None:
                aggregation = np.mean
        use_fast = not customised
        aggregation = cast("Callable[[list[Score]], Any]", aggregation)
        getter = cast("Callable[[ScoresDict], Score]", getter)

        def _score_of(task_result):
            # One place for the fast/slow choice shared by both output formats.
            if use_fast:
                return task_result._get_score_fast(
                    splits=splits,
                    languages=languages,
                )
            return task_result.get_score(
                splits=splits,
                languages=languages,
                aggregation=aggregation,
                getter=getter,
                scripts=scripts,
            )

        if format == "wide":
            score_by_task = {}
            for task_result in self.task_results:
                try:
                    score_by_task[task_result.task_name] = _score_of(task_result)
                except Exception as e:
                    warnings.warn(
                        f"Couldn't get scores for {task_result.task_name} due to {e}."
                    )
            return score_by_task
        if format == "long":
            rows = []
            for task_result in self.task_results:
                try:
                    rows.append(
                        {
                            "model_name": self.model_name,
                            "model_revision": self.model_revision,
                            "task_name": task_result.task_name,
                            "score": _score_of(task_result),
                            "mteb_version": task_result.mteb_version,
                            "dataset_revision": task_result.dataset_revision,
                            "evaluation_time": task_result.evaluation_time,
                            "kg_co2_emissions": task_result.kg_co2_emissions,
                        }
                    )
                except Exception as e:
                    warnings.warn(
                        f"Couldn't get scores for {task_result.task_name} due to {e}."
                    )
            return rows

    def _get_score_for_table(self) -> list[dict[str, str | float | list[str]]]:
        """Flatten all task results into one row per (task, split, subset entry).

        Missing keys of a subset entry fall back to ``["Unknown"]`` for the languages,
        ``"default"`` for the subset and ``None`` for the score.
        """
        model_name = self.model_name
        return [
            {
                "model_name": model_name,
                "model_revision": self.model_revision,
                "task_name": task_result.task_name,
                "split": split,
                "language": score_item.get("languages", ["Unknown"]),
                "subset": score_item.get("hf_subset", "default"),
                "score": score_item.get("main_score", None),
            }
            for task_result in self.task_results
            for split, scores_list in task_result.scores.items()
            for score_item in scores_list
        ]

    def to_dataframe(
        self,
        aggregation_level: Literal["subset", "split", "task"] = "task",
        aggregation_fn: Callable[[list[Score]], Any] | str | None = None,
        include_model_revision: bool = False,
        format: Literal["wide", "long"] = "wide",
    ) -> pd.DataFrame:
        """Get a DataFrame with the scores of this model on all of its tasks.

        The DataFrame has the following columns in addition to the metadata columns:

        - model_name: The name of the model.
        - task_name: The name of the task.
        - score: The main score of the model on the task.

        Depending on the aggregation level it can also have the following columns:

        - split: The split of the task. E.g. "test", "train", "validation".
        - subset: The subset of the task. E.g. "en", "fr-en".

        The per-subset rows are aggregated according to the aggregation level and then
        returned in either wide or long format.

        Args:
            aggregation_level: The aggregation to use. Can be one of:
                - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset.
                - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split.
                - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.
            aggregation_fn: The function to use for aggregation. If None, the mean will be used.
            include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded.
            format: The format of the DataFrame. Can be one of:
                - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells.
                - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns.

        Returns:
            A DataFrame with the scores for all models and tasks. An empty DataFrame is
            returned (with a warning) when there are no scores.
        """
        rows = self._get_score_for_table()

        if not rows:
            msg = "No scores data available. Returning empty DataFrame."
            logger.warning(msg)
            warnings.warn(msg)
            return pd.DataFrame()

        frame = pd.DataFrame(rows)

        # The revision column is either dropped or promoted to an identifying column.
        if include_model_revision is False:
            frame = frame.drop(columns=["model_revision"])
            identifying_columns = ["model_name"]
        else:
            identifying_columns = ["model_name", "model_revision"]

        return _aggregate_and_pivot(
            frame,
            columns=identifying_columns,
            aggregation_level=aggregation_level,
            format=format,
            aggregation_fn=aggregation_fn,
        )

    def __hash__(self) -> int:
        """Hash the result by object identity (``id(self)``), not by its field values."""
        return id(self)

    def __iter__(self) -> Iterable[TaskResult]:  # type: ignore[override]
        """Iterate over the contained TaskResult objects in the order of ``task_results``."""
        return iter(self.task_results)

    def __getitem__(self, index: int) -> TaskResult:
        """Return the task result stored at position `index` in `task_results`."""
        return self.task_results[index]

    def __len__(self) -> int:
        """Return the number of task results."""
        return len(self.task_results)

    @property
    def languages(self) -> list[str]:
        """Collect the distinct languages covered by the task results.

        Returns:
            The unique languages across all task results, in no particular order.
        """
        unique_langs = {
            lang for task_res in self.task_results for lang in task_res.languages
        }
        return list(unique_langs)

    @property
    def domains(self) -> list[str]:
        """Collect the distinct domains covered by the task results.

        Returns:
            The unique domains across all task results, in no particular order.
        """
        unique_domains = {
            domain for task_res in self.task_results for domain in task_res.domains
        }
        return list(unique_domains)

    @property
    def task_types(self) -> list[str]:
        """Collect the distinct task types present in the task results.

        Returns:
            The unique task types, in no particular order.
        """
        distinct_types = {result.task_type for result in self.task_results}
        return list(distinct_types)

    @property
    def task_names(self) -> list[str]:
        """List the name of every task result.

        Returns:
            One task name per task result, in the same order as `task_results`
            (duplicates are kept).
        """
        return [result.task_name for result in self.task_results]

    @property
    def modalities(self) -> list[Modalities]:
        """Collect the distinct modalities declared by the task results.

        Returns:
            The unique modalities, in no particular order. Falls back to
            `["text"]` when no task result declares any modality.
        """
        found: set[Modalities] = {
            modality
            for task_res in self.task_results
            # Task results without a `modalities` attribute contribute nothing.
            for modality in getattr(task_res, "modalities", [])
        }
        if not found:
            return ["text"]
        return list(found)

    def to_disk(self, path: Path) -> None:
        """Save ModelResult to disk as JSON.

        The file is written as UTF-8 so that it can be read back by `from_disk`,
        which opens the file with `encoding="utf-8"`.

        Args:
            path: The path to the file to save.
        """
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with path.open("w", encoding="utf-8") as f:
            f.write(self.model_dump_json(indent=2))

    @classmethod
    def from_disk(cls, path: Path) -> ModelResult:
        """Read a JSON file and validate it into a ModelResult.

        Args:
            path: Location of the JSON file to load (read as UTF-8).

        Returns:
            The ModelResult parsed and validated from the file content.
        """
        raw_json = path.read_text(encoding="utf-8")
        return cls.model_validate_json(raw_json)

    def push_model_results(
        self,
        user: str | None = None,
        *,
        benchmark: Benchmark | Sequence[Benchmark] | None = None,
        create_pr: bool = False,
        raise_error: bool = False,
    ) -> None:
        """Push the model results to the Hugging Face Hub.

        Every task result is written to `<task_name>.yaml` and, when `benchmark` is given,
        every benchmark's "Mean(Task)" score is written to `<benchmark name>.yaml`. The files
        are then uploaded to the `.eval_results` folder of the model repository `self.model_name`.

        Args:
            user: The user or organization of results source.
            benchmark: Benchmark(s) whose "Mean(Task)" score should be pushed in addition to the
                task results. If None, only the task results are pushed.
            create_pr: Whether to create a pull request
            raise_error: If True, re-raise the ValueError raised while computing a benchmark score.
                If False, a warning is logged and that benchmark is recorded with a score of None.

        Raises:
            ValueError: If a benchmark has no name, or, when `raise_error` is True, if computing
                a benchmark score raises a ValueError.
        """
        benchmark_results: None | list[HFEvalResult] = None
        benchmarks: None | Sequence[Benchmark] = None
        if benchmark is not None:
            benchmark_results = []
            # Normalize a single Benchmark into a one-element list.
            benchmarks = [benchmark] if isinstance(benchmark, Benchmark) else benchmark
            for cur_benchmark in benchmarks:
                try:
                    benchmark_score = cur_benchmark._get_model_score(self)["Mean(Task)"]
                except ValueError:
                    if raise_error:
                        raise
                    logger.warning(
                        f"Model {self.model_name} have missing scores on {cur_benchmark.name}. Skipping it"
                    )
                    # NOTE(review): despite the "Skipping it" message, an entry with value=None
                    # is still appended below and written to disk — confirm this is intended.
                    benchmark_score = None
                benchmark_results.append(
                    HFEvalResult(
                        dataset=HFEvalResultDataset(
                            id=cur_benchmark.benchmark_hf_repo,
                            task_id=cur_benchmark.name,
                            revision="1",
                        ),
                        value=benchmark_score,
                        date=None,
                        notes="Obtained using MTEB",
                        source=HFEvalResultSource(
                            url="https://github.com/embeddings-benchmark/mteb/",
                            user=user,
                            name="Obtained using MTEB",
                        ),
                    )
                )
        # Stage all YAML files in a temporary directory and upload it in one call.
        with tempfile.TemporaryDirectory() as tmpdir:
            path = Path(tmpdir)
            # One YAML file per task result.
            for task_result in self.task_results:
                task_results = task_result._to_hf_benchmark_result(user)
                with (path / f"{task_result.task_name}.yaml").open(
                    "w", encoding="utf-8"
                ) as f:
                    f.write(task_results.to_yaml())

            # One YAML file per benchmark; benchmarks and their results are index-aligned.
            if benchmark_results is not None and benchmarks is not None:
                for cur_benchmark, benchmark_result in zip(
                    benchmarks, benchmark_results
                ):
                    if cur_benchmark.name is None:
                        raise ValueError(
                            f"Benchmark {cur_benchmark} doesn't have name."
                        )
                    with (path / f"{cur_benchmark.name}.yaml").open(
                        "w", encoding="utf-8"
                    ) as f:
                        f.write(
                            HFEvalResults.model_validate([benchmark_result]).to_yaml()
                        )

            # The upload must happen while the temporary directory still exists.
            huggingface_hub.upload_folder(
                repo_id=self.model_name,
                repo_type="model",
                path_in_repo=".eval_results",
                folder_path=path,
                commit_message=f"Add evaluation results for model {self.model_name} revision {self.model_revision}",
                create_pr=create_pr,
            )

`domains` `property` ¶

Get all domains in the model results.

Returns:

Type	Description
`list[str]`	A list of domains in the model results.

`languages` `property` ¶

Get all languages in the model results.

Returns:

Type	Description
`list[str]`	A list of languages in the model results.

`modalities` `property` ¶

Get all modalities in the task results.

Returns:

Type	Description
`list[Modalities]`	A list of modalities in the task results.

`task_names` `property` ¶

Get all task names in the model results.

Returns:

Type	Description
`list[str]`	A list of task names in the model results.

`task_types` `property` ¶

Get all task types in the model results.

Returns:

Type	Description
`list[str]`	A list of task types in the model results.

`from_disk(path)` `classmethod` ¶

Load ModelResult from disk.

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path to the JSON file to load.	required

Returns:

Type	Description
`ModelResult`	The loaded ModelResult object.

Source code in mteb/results/model_result.py

@classmethod
def from_disk(cls, path: Path) -> ModelResult:
    """Read a JSON file and validate it into a ModelResult.

    Args:
        path: Location of the JSON file to load (read as UTF-8).

    Returns:
        The ModelResult parsed and validated from the file content.
    """
    raw_json = path.read_text(encoding="utf-8")
    return cls.model_validate_json(raw_json)

`from_validated(**data)` `classmethod` ¶

Create a ModelResult from validated data.

Parameters:

Name	Type	Description	Default
`data`	`dict[str, Any]`	The validated data.	`{}`

Source code in mteb/results/model_result.py

@classmethod
def from_validated(cls, **data: dict[str, Any]) -> ModelResult:
    """Build a ModelResult from already-validated data, skipping validation.

    Each entry of `data["task_results"]` is turned into a TaskResult through
    `TaskResult.from_validated`, and the model itself is created with `model_construct`.

    Args:
        data: The validated data. Must contain a "task_results" entry.
    """
    task_results = []
    for raw_result in data["task_results"]:
        task_results.append(TaskResult.from_validated(**raw_result))  # type: ignore[arg-type]
    data["task_results"] = task_results  # type: ignore[assignment]
    return cls.model_construct(**data)  # type: ignore[arg-type]

`push_model_results(user=None, *, benchmark=None, create_pr=False, raise_error=False)` ¶

Push the model results to the Hugging Face Hub.

Parameters:

Name	Type	Description	Default
`user`	`str \| None`	The user or organization of results source.	`None`
`benchmark`	`Benchmark \| Sequence[Benchmark] \| None`	Benchmark(s) whose "Mean(Task)" score should be pushed in addition to the task results. If None, only the task results are pushed.	`None`
`create_pr`	`bool`	Whether to create a pull request	`False`
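`raise_error`	`bool`	If True, re-raise the error when a benchmark score cannot be computed because the model has missing scores. If False, log a warning and record that benchmark with an empty score.	`False`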

Source code in mteb/results/model_result.py

def push_model_results(
    self,
    user: str | None = None,
    *,
    benchmark: Benchmark | Sequence[Benchmark] | None = None,
    create_pr: bool = False,
    raise_error: bool = False,
) -> None:
    """Push the model results to the Hugging Face Hub.

    Every task result is written to `<task_name>.yaml` and, when `benchmark` is given,
    every benchmark's "Mean(Task)" score is written to `<benchmark name>.yaml`. The files
    are then uploaded to the `.eval_results` folder of the model repository `self.model_name`.

    Args:
        user: The user or organization of results source.
        benchmark: Benchmark(s) whose "Mean(Task)" score should be pushed in addition to the
            task results. If None, only the task results are pushed.
        create_pr: Whether to create a pull request
        raise_error: If True, re-raise the ValueError raised while computing a benchmark score.
            If False, a warning is logged and that benchmark is recorded with a score of None.

    Raises:
        ValueError: If a benchmark has no name, or, when `raise_error` is True, if computing
            a benchmark score raises a ValueError.
    """
    benchmark_results: None | list[HFEvalResult] = None
    benchmarks: None | Sequence[Benchmark] = None
    if benchmark is not None:
        benchmark_results = []
        # Normalize a single Benchmark into a one-element list.
        benchmarks = [benchmark] if isinstance(benchmark, Benchmark) else benchmark
        for cur_benchmark in benchmarks:
            try:
                benchmark_score = cur_benchmark._get_model_score(self)["Mean(Task)"]
            except ValueError:
                if raise_error:
                    raise
                logger.warning(
                    f"Model {self.model_name} have missing scores on {cur_benchmark.name}. Skipping it"
                )
                # NOTE(review): despite the "Skipping it" message, an entry with value=None
                # is still appended below and written to disk — confirm this is intended.
                benchmark_score = None
            benchmark_results.append(
                HFEvalResult(
                    dataset=HFEvalResultDataset(
                        id=cur_benchmark.benchmark_hf_repo,
                        task_id=cur_benchmark.name,
                        revision="1",
                    ),
                    value=benchmark_score,
                    date=None,
                    notes="Obtained using MTEB",
                    source=HFEvalResultSource(
                        url="https://github.com/embeddings-benchmark/mteb/",
                        user=user,
                        name="Obtained using MTEB",
                    ),
                )
            )
    # Stage all YAML files in a temporary directory and upload it in one call.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = Path(tmpdir)
        # One YAML file per task result.
        for task_result in self.task_results:
            task_results = task_result._to_hf_benchmark_result(user)
            with (path / f"{task_result.task_name}.yaml").open(
                "w", encoding="utf-8"
            ) as f:
                f.write(task_results.to_yaml())

        # One YAML file per benchmark; benchmarks and their results are index-aligned.
        if benchmark_results is not None and benchmarks is not None:
            for cur_benchmark, benchmark_result in zip(
                benchmarks, benchmark_results
            ):
                if cur_benchmark.name is None:
                    raise ValueError(
                        f"Benchmark {cur_benchmark} doesn't have name."
                    )
                with (path / f"{cur_benchmark.name}.yaml").open(
                    "w", encoding="utf-8"
                ) as f:
                    f.write(
                        HFEvalResults.model_validate([benchmark_result]).to_yaml()
                    )

        # The upload must happen while the temporary directory still exists.
        huggingface_hub.upload_folder(
            repo_id=self.model_name,
            repo_type="model",
            path_in_repo=".eval_results",
            folder_path=path,
            commit_message=f"Add evaluation results for model {self.model_name} revision {self.model_revision}",
            create_pr=create_pr,
        )

`select_tasks(tasks)` ¶

Select tasks from the ModelResult based on a list of AbsTask objects.

Parameters:

Name	Type	Description	Default
`tasks`	`Iterable[AbsTask]`	A sequence of AbsTask objects to select from the ModelResult.	required

Source code in mteb/results/model_result.py

def select_tasks(self, tasks: Iterable[AbsTask]) -> ModelResult:
    """Keep only the task results that correspond to one of the given tasks.

    Task results are matched to tasks by name (`task.metadata.name`). Each matching
    task result is passed through `validate_and_filter_scores` with its task.

    Args:
        tasks: The AbsTask objects to select from the ModelResult.
    """
    requested = {}
    for task in tasks:
        requested[task.metadata.name] = task

    kept = []
    for result in self.task_results:
        if result.task_name not in requested:
            continue
        kept.append(result.validate_and_filter_scores(requested[result.task_name]))

    return type(self).model_construct(
        model_name=self.model_name,
        model_revision=self.model_revision,
        task_results=kept,
        experiment_name=self.experiment_name,
    )

`to_dataframe(aggregation_level='task', aggregation_fn=None, include_model_revision=False, format='wide')` ¶

Get a DataFrame with the scores for all models and tasks.

The DataFrame will have the following columns in addition to the metadata columns:

model_name: The name of the model.
task_name: The name of the task.
score: The main score of the model on the task.

In addition, the DataFrame can have the following columns depending on the aggregation level:

split: The split of the task. E.g. "test", "train", "validation".
subset: The subset of the task. E.g. "en", "fr-en".

Afterwards, the DataFrame will be aggregated according to the aggregation method and pivoted to either a wide or a long format.

Parameters:

Name	Type	Description	Default
`aggregation_level`	`Literal['subset', 'split', 'task']`	The aggregation to use. Can be one of: - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset. - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split. - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.	`'task'`
`aggregation_fn`	`Callable[[list[Score]], Any] \| str \| None`	The function to use for aggregation. If None, the mean will be used.	`None`
`include_model_revision`	`bool`	If True, the model revision will be included in the DataFrame. If False, it will be excluded.	`False`
`format`	`Literal['wide', 'long']`	The format of the DataFrame. Can be one of: - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells. - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns.	`'wide'`

Returns:

Type	Description
`DataFrame`	A DataFrame with the scores for all models and tasks.

Source code in mteb/results/model_result.py

def to_dataframe(
    self,
    aggregation_level: Literal["subset", "split", "task"] = "task",
    aggregation_fn: Callable[[list[Score]], Any] | str | None = None,
    include_model_revision: bool = False,
    format: Literal["wide", "long"] = "wide",
) -> pd.DataFrame:
    """Build a DataFrame of scores from this model's task results.

    Before aggregation, the DataFrame has the following columns in addition to the metadata columns:

    - model_name: The name of the model.
    - task_name: The name of the task.
    - score: The main score of the model on the task.

    Depending on the aggregation level, it can also have:

    - split: The split of the task. E.g. "test", "train", "validation".
    - subset: The subset of the task. E.g. "en", "fr-en".

    The DataFrame is then aggregated at the requested level and returned in either wide or long format.

    Args:
        aggregation_level: The aggregation to use. Can be one of:
            - "subset": No aggregation will be done. The DataFrame will have one row per model, task, split and subset.
            - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split.
            - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.
        aggregation_fn: The function to use for aggregation. If None, the mean will be used.
        include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded.
        format: The format of the DataFrame. Can be one of:
            - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells.
            - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns.

    Returns:
        A DataFrame with the scores for all models and tasks. Empty when there are no scores.
    """
    rows = self._get_score_for_table()

    if not rows:
        msg = "No scores data available. Returning empty DataFrame."
        logger.warning(msg)
        warnings.warn(msg)
        return pd.DataFrame()

    df = pd.DataFrame(rows)

    # Only the literal False drops the revision column; anything else keeps it.
    if include_model_revision is False:
        df = df.drop(columns=["model_revision"])
        model_columns = ["model_name"]
    else:
        model_columns = ["model_name", "model_revision"]

    return _aggregate_and_pivot(
        df,
        columns=model_columns,
        aggregation_level=aggregation_level,
        format=format,
        aggregation_fn=aggregation_fn,
    )

`to_disk(path)` ¶

Save ModelResult to disk as JSON.

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path to the file to save.	required

Source code in mteb/results/model_result.py

def to_disk(self, path: Path) -> None:
    """Save ModelResult to disk as JSON.

    The file is written as UTF-8 so that it can be read back by `from_disk`,
    which opens the file with `encoding="utf-8"`.

    Args:
        path: The path to the file to save.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with path.open("w", encoding="utf-8") as f:
        f.write(self.model_dump_json(indent=2))

`mteb.results.BenchmarkResults` ¶

Bases: BaseModel

Data class to hold the benchmark results of a set of models.

Attributes:

Name	Type	Description
`model_results`	`list[ModelResult]`	List of ModelResult objects.

Source code in mteb/results/benchmark_results.py

class BenchmarkResults(BaseModel):  # noqa: PLR0904
    """Data class to hold the benchmark results of a model.

    Attributes:
        model_results: List of ModelResult objects.
    """

    model_results: list[ModelResult]
    benchmark: Benchmark | Sequence[Benchmark] | None = None
    model_config = ConfigDict(
        protected_namespaces=(),  # to free up the name model_results which is otherwise protected
        arbitrary_types_allowed=True,  # Benchmark is dataclasses.dataclass
    )

    def __repr__(self) -> str:
        """Return a compact representation that shows only the number of model results."""
        n_models = len(self.model_results)
        return f"BenchmarkResults(model_results=[...](#{n_models}))"

    def __hash__(self) -> int:
        """Hash the results object by identity (`id(self)`) rather than by its field values."""
        return id(self)

    def _filter_tasks(
        self,
        task_names: list[str] | None = None,
        *,
        languages: list[str] | None = None,
        domains: list[TaskDomain] | None = None,
        task_types: list[TaskType] | None = None,
        modalities: list[Modalities] | None = None,
        is_public: bool | None = None,
    ) -> BenchmarkResults:
        """Apply the same task filter to every model result.

        All filter arguments are forwarded unchanged to each model result's `_filter_tasks`.
        Model results left without any task result are dropped.
        """
        # TODO: Same as filter_models
        kept = []
        for res in self.model_results:
            filtered = res._filter_tasks(
                task_names=task_names,
                languages=languages,
                domains=domains,
                task_types=task_types,
                modalities=modalities,
                is_public=is_public,
            )
            if filtered.task_results:
                kept.append(filtered)
        return type(self).model_construct(model_results=kept)

    def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
        """Select tasks from the benchmark results.

        Args:
            tasks: The AbsTask objects to select. Any iterable is accepted, including
                one-shot iterators such as generators.

        Returns:
            A new BenchmarkResults object with the selected tasks.
        """
        # Materialize once: `tasks` is handed to every model result, and a one-shot
        # iterator would otherwise be exhausted after the first model.
        task_list = list(tasks)
        new_model_results = [
            model_res.select_tasks(task_list) for model_res in self.model_results
        ]
        return type(self).model_construct(model_results=new_model_results)

    def select_models(
        self,
        names: list[str] | list[ModelMeta],
        revisions: list[str | None] | None = None,
    ) -> BenchmarkResults:
        """Select model results by model name and, optionally, revision.

        Args:
            names: Model names to keep. ModelMeta objects are also accepted; in that case the
                meta's own revision is used and the matching entry of `revisions` is ignored.
            revisions: One revision per entry in `names`; a None entry matches every revision
                of that model. If None, all revisions are returned.

        Returns:
            A new BenchmarkResults object with the filtered models.

        Raises:
            ValueError: If `names` and `revisions` differ in length, or if a ModelMeta has no name.
        """
        if revisions is None:
            requested_revisions = [None] * len(names)
        else:
            requested_revisions = revisions

        if len(names) != len(requested_revisions):
            raise ValueError(
                "The length of names and revisions must be the same or revisions must be None."
            )

        # Maps model name -> requested revision (None means "any revision").
        wanted: dict[str, str | None] = {}
        for entry, requested in zip(names, requested_revisions):
            if not isinstance(entry, ModelMeta):
                wanted[cast("str", entry)] = requested
                continue
            if entry.name is None:
                raise ValueError("name in ModelMeta is None. It must be a string.")
            wanted[entry.name] = entry.revision

        selected = [
            model_res
            for model_res in self.model_results
            if model_res.model_name in wanted
            and (
                wanted[model_res.model_name] is None
                or model_res.model_revision == wanted[model_res.model_name]
            )
        ]
        return type(self).model_construct(model_results=selected)

    def _filter_models(
        self,
        model_names: Iterable[str] | None = None,
        *,
        languages: Iterable[str] | None = None,
        open_weights: bool | None = None,
        frameworks: Iterable[str] | None = None,
        n_parameters_range: tuple[int | None, int | None] = (None, None),
        use_instructions: bool | None = None,
        zero_shot_on: list[AbsTask] | None = None,
    ) -> BenchmarkResults:
        """Keep the model results whose model name matches the given model-metadata filters.

        All arguments are forwarded unchanged to `get_model_metas`.
        """
        # mostly a utility function for the leaderboard app.
        # I would probably move the filtering of the models outside of this call. No need to call get_model_metas inside the filter.
        # interface would then be the same as the get_models function

        matching_metas = get_model_metas(
            model_names=model_names,
            languages=languages,
            open_weights=open_weights,
            frameworks=frameworks,
            n_parameters_range=n_parameters_range,
            use_instructions=use_instructions,
            zero_shot_on=zero_shot_on,
        )
        # Only names are compared here; model revisions are not taken into account.
        allowed_names = {meta.name for meta in matching_metas}
        kept = [
            model_res for model_res in self if model_res.model_name in allowed_names
        ]
        return type(self).model_construct(model_results=kept)

    def join_revisions(self) -> BenchmarkResults:
        """Join revisions of the same model.

        For every (model, task) pair a single task result is kept. In case of conflicts,
        the following rules are applied:
        1) If the main revision is present, it is kept. The main revision is the one defined in the model's ModelMeta object.
        2) If there are multiple revisions and some of them are None or na, they are filtered out.
        3) If there is no main revision, we prefer the one run using the latest mteb version.

        Task results without scores are discarded before the rules are applied.

        Returns:
            A new BenchmarkResults object with the revisions joined.
        """
        # Flatten every task result into one record so the selection can be done with pandas.
        records = []
        for model_result in self:
            for task_result in model_result.task_results:
                records.append(
                    dict(
                        model=model_result.model_name,
                        revision=model_result.model_revision,
                        task_name=task_result.task_name,
                        mteb_version=task_result.mteb_version,
                        task_result=task_result,
                        has_scores=bool(task_result.scores),
                    )
                )
        if not records:
            return BenchmarkResults.model_construct(model_results=[])
        task_df = pd.DataFrame.from_records(records)

        # Use cached model metas
        model_to_main_revision = _get_cached_model_metas()
        task_df["main_revision"] = task_df["model"].map(model_to_main_revision)

        # Use cached version parsing
        task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)

        # Filter out rows without scores first
        task_df = task_df[task_df["has_scores"]]

        # Optimize groupby with vectorized operations
        # Sort by priority: main_revision match, then mteb_version (descending), then revision
        task_df["is_main_revision"] = task_df["revision"] == task_df["main_revision"]

        # Handle None/NA/external revisions
        task_df["revision_clean"] = task_df["revision"].copy()
        task_df.loc[task_df["revision"].isna(), "revision_clean"] = (
            "no_revision_available"
        )
        task_df.loc[task_df["revision"] == "external", "revision_clean"] = (
            "no_revision_available"
        )

        # Create a priority column for sorting
        # Higher priority = better to keep
        # Priority: main_revision (1000), has valid mteb_version (100), has valid revision (10)
        task_df["priority"] = 0
        task_df.loc[task_df["is_main_revision"], "priority"] += 1000
        task_df.loc[task_df["mteb_version"].notna(), "priority"] += 100
        task_df.loc[
            task_df["revision_clean"] != "no_revision_available", "priority"
        ] += 10

        # Sort by priority (desc), mteb_version (desc), and take first per group
        task_df = task_df.sort_values(
            ["model", "task_name", "priority", "mteb_version"],
            ascending=[True, True, False, False],
            na_position="last",
        )

        # NOTE(review): pandas' GroupBy.first skips NA values per column by default, so when the
        # top-ranked row has revision None, its task_result may be paired with the revision of a
        # lower-ranked row — confirm this is intended.
        task_df = task_df.groupby(["model", "task_name"], as_index=False).first()

        # Reconstruct model results
        model_results = []
        # Group by original revision to maintain deterministic behavior
        # After the first() selection above, each (model, task_name) is unique,
        # so grouping by original revision ensures consistent ModelResult creation
        # NOTE(review): groupby drops NA group keys by default (dropna=True), so rows whose
        # revision is still None after first() appear to be left out of the output — confirm.
        for (model, model_revision), group in task_df.groupby(["model", "revision"]):
            model_result = ModelResult.model_construct(
                model_name=model,  # type: ignore[arg-type]
                model_revision=model_revision,  # type: ignore[arg-type]
                task_results=list(group["task_result"]),
            )
            model_results.append(model_result)
        return BenchmarkResults.model_construct(model_results=model_results)

    def _get_scores(
        self,
        *,
        splits: list[SplitName] | None = None,
        languages: list[ISOLanguage | ISOLanguageScript] | None = None,
        scripts: list[ISOLanguageScript] | None = None,
        getter: Callable[[ScoresDict], Score] | None = None,
        aggregation: Callable[[list[Score]], Any] | None = None,
        format: Literal["wide", "long"] = "wide",
    ) -> list[dict[str, Any]]:
        """Collect the scores of every model result as a list of dict entries.

        In "wide" format there is one entry per model, made of "model", "revision" and the
        model's own wide scores. In "long" format the long entries of all models are
        concatenated. A model whose scores cannot be retrieved is skipped with a warning.
        """
        entries: list[dict[str, Any]] = []
        if format not in ("wide", "long"):
            # Any other format yields no entries.
            return entries

        for model_res in self:
            try:
                model_scores = model_res._get_scores(
                    splits=splits,
                    languages=languages,
                    scripts=scripts,
                    getter=getter,
                    aggregation=aggregation,
                    format=format,
                )
                if format == "wide":
                    entries.append(
                        {
                            "model": model_res.model_name,
                            "revision": model_res.model_revision,
                            **model_scores,
                        }
                    )
                else:
                    entries.extend(model_scores)
            except Exception as e:
                warnings.warn(
                    f"Couldn't get scores for {model_res.model_name}({model_res.model_revision}), due to: {e}"
                )
        return entries

    def to_dataframe(
        self,
        aggregation_level: Literal["subset", "split", "task", "language"] = "task",
        aggregation_fn: Callable[[list[Score]], Any] | None = None,
        include_model_revision: bool = False,
        format: Literal["wide", "long"] = "wide",
    ) -> pd.DataFrame:
        """Build a DataFrame of scores for all models and tasks.

        Before aggregation, the DataFrame has the following columns in addition to the metadata columns:

        - model_name: The name of the model.
        - task_name: The name of the task.
        - score: The main score of the model on the task.

        Depending on the aggregation level, it can also have:

        - split: The split of the task. E.g. "test", "train", "validation".
        - subset: The subset of the task. E.g. "en", "fr-en".

        The DataFrame is then aggregated at the requested level and returned in either wide or long format.

        Args:
            aggregation_level: The aggregation to use. Can be one of:
                - "subset": No aggregation will be done. The DataFrame will have one row per model, task, split and subset.
                - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split.
                - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.
                - "language": Aggregates the scores by language. The DataFrame will have one row per model and language.
            aggregation_fn: The function to use for aggregation. If None, the mean will be used.
            include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded.
                If there are multiple revisions for the same model, they will be joined using the `join_revisions` method.
            format: The format of the DataFrame. Can be one of:
                - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells.
                - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns.

        Returns:
            A DataFrame with the scores for all models and tasks. Empty when there are no scores.
        """
        pre_agg = self._build_pre_agg_df(include_model_revision)
        if pre_agg is None:
            msg = "No scores data available. Returning empty DataFrame."
            logger.warning(msg)
            warnings.warn(msg)
            return pd.DataFrame()

        if include_model_revision:
            model_columns = ["model_name", "model_revision"]
        else:
            model_columns = ["model_name"]

        pivoted = _aggregate_and_pivot(
            pre_agg,
            columns=model_columns,
            aggregation_level=aggregation_level,
            aggregation_fn=aggregation_fn,
            format=format,
        )
        # Cast categorical columns back to object so downstream string ops don't
        # raise "can only concatenate str (not Categorical) to str".
        categorical_columns = pivoted.select_dtypes(include="category").columns
        for col in categorical_columns:
            pivoted[col] = pivoted[col].astype(object)
        return pivoted

    def _build_pre_agg_df(self, include_model_revision: bool) -> pd.DataFrame | None:
        """Build the pre-aggregation long DataFrame; returns None when no scores exist."""
        bench_results = self
        if include_model_revision is False:
            bench_results = bench_results.join_revisions()

        # Collect parallel arrays rather than a list of dicts:
        # pd.DataFrame(dict_of_lists) is ~10x faster than pd.DataFrame(list_of_dicts).
        col_model_name: list[Any] = []
        col_model_rev: list[Any] = []
        col_task_name: list[Any] = []
        col_split: list[Any] = []
        col_language: list[Any] = []
        col_subset: list[Any] = []
        col_score: list[Any] = []

        for model_result in bench_results:
            mn = model_result.model_name
            mr = model_result.model_revision
            for task_result in model_result.task_results:
                tn = task_result.task_name
                for split, scores_list in task_result.scores.items():
                    for score_item in scores_list:
                        col_model_name.append(mn)
                        col_model_rev.append(mr)
                        col_task_name.append(tn)
                        col_split.append(split)
                        col_language.append(score_item.get("languages", ["Unknown"]))
                        col_subset.append(score_item.get("hf_subset", "default"))
                        col_score.append(score_item.get("main_score", None))

        if not col_model_name:
            return None

        df = pd.DataFrame(
            {
                "model_name": col_model_name,
                "model_revision": col_model_rev,
                "task_name": col_task_name,
                "split": col_split,
                "language": col_language,
                "subset": col_subset,
                "score": col_score,
            }
        )
        if include_model_revision is False:
            df = df.drop(columns=["model_revision"])
        # Categoricals shrink memory ~4x for high-cardinality string columns.
        for col in ("model_name", "task_name", "split", "subset"):
            if col in df.columns:
                df[col] = df[col].astype("category")
        return df

    def get_aggregated_scores(
        self,
    ) -> (
        dict[str, dict[str, float | None]]
        | dict[str, dict[str, dict[str, float | None]]]  # multiple benchmarks
    ):
        """Get aggregated scores for each model.

        When a benchmark is associated with these results, uses
        :meth:`Benchmark.get_score` to compute scores.  Otherwise computes
        the equivalent statistics directly from all task results.

        Returns:
            A dict mapping each model name to a dict with the keys:

            - ``"Mean(Task)"``: mean score across all (benchmark) tasks.
            - ``"Mean(TaskType)"``: mean of per-task-type means.

        Examples:
            >>> bench_results.get_aggregated_scores()
            {
                "model1": {"Mean(Task)": 0.5, "Mean(TaskType)": 0.52},
                "model2": {"Mean(Task)": 0.45, "Mean(TaskType)": 0.48},
            }
        """
        if self.benchmark is not None:
            if isinstance(self.benchmark, Sequence):
                return {b.name: b.get_score(self) for b in self.benchmark}
            return self.benchmark.get_score(self)

        from mteb.benchmarks._benchmark_metrics import (
            _compute_mean_task,
            _compute_mean_task_type,
        )

        bench_results = self.join_revisions()
        return {
            model_result.model_name: {
                "Mean(Task)": _compute_mean_task(model_result.task_results),
                "Mean(TaskType)": _compute_mean_task_type(model_result.task_results),
            }
            for model_result in bench_results
        }

    def get_benchmark_result(self) -> pd.DataFrame:
        """Build the benchmark summary table for these results.

        The table is produced by the attached benchmark's
        ``_create_summary_table`` method.

        Returns:
            A DataFrame with the aggregated benchmark scores for each model.

        Raises:
            ValueError: If no benchmark is attached, or if a sequence of
                benchmarks is attached.
        """
        benchmark = self.benchmark

        if benchmark is None:
            message = (
                "No benchmark associated with these results (self.benchmark is None). "
                "To get benchmark results, load results with a Benchmark object. "
                "`results = cache.load_results(tasks='MTEB(eng, v2)')`"
            )
            raise ValueError(message)

        if isinstance(benchmark, Sequence):
            raise ValueError("Getting scores for multiple benchmarks is unsupported")

        summary = benchmark._create_summary_table(self)
        return summary

    def __iter__(self) -> Iterator[ModelResult]:  # type: ignore[override]
        """Iterate over the entries of ``self.model_results``."""
        # NOTE(review): the ``type: ignore`` presumably silences a clash with the
        # base class's ``__iter__`` signature; confirm against the class header.
        return iter(self.model_results)

    def __getitem__(self, index: int) -> ModelResult:
        """Return the entry of ``self.model_results`` at position ``index``."""
        return self.model_results[index]

    def to_dict(self) -> dict[str, Any]:
        """Return these results as a plain dictionary.

        Returns:
            The output of ``self.model_dump()``.
        """
        dumped = self.model_dump()
        return dumped

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Self:
        """Build an instance from a dictionary.

        Args:
            data: Dictionary passed to ``cls.model_validate``.

        Returns:
            The validated instance.
        """
        instance = cls.model_validate(data)
        return instance

    def to_disk(self, path: Path | str) -> None:
        """Save the BenchmarkResults to a JSON file."""
        path = Path(path)
        with path.open("w") as out_file:
            out_file.write(self.model_dump_json(indent=2))

    @classmethod
    def from_validated(cls, **data: Any) -> BenchmarkResults:
        """Build an instance from data that is already validated.

        Each entry of ``data["model_results"]`` is expanded into
        ``ModelResult.from_validated`` and the instance is assembled with
        ``cls.model_construct``.

        Args:
            **data: Keyword arguments; must contain ``"model_results"``.

        Returns:
            An instance of BenchmarkResults.
        """
        model_results = [
            ModelResult.from_validated(**entry) for entry in data["model_results"]
        ]
        return cls.model_construct(model_results=model_results)

    @classmethod
    def from_disk(cls, path: Path | str) -> Self:
        """Load the BenchmarkResults from a JSON file.

        Args:
            path: Path to the JSON file.

        Returns:
            An instance of BenchmarkResults.
        """
        # Read as UTF-8 explicitly: the default text encoding depends on the
        # platform locale and can mis-decode non-ASCII characters.
        with Path(path).open(encoding="utf-8") as in_file:
            data = json.load(in_file)
        return cls.from_dict(data)

    @property
    def languages(self) -> list[str]:
        """Collect the distinct languages across all model results.

        Returns:
            The de-duplicated union of every model result's ``languages``, in
            no guaranteed order.
        """
        unique: set[str] = set()
        for model_res in self.model_results:
            unique.update(model_res.languages)
        return list(unique)

    @property
    def domains(self) -> list[str]:
        """Get all domains in the benchmark results.

        Returns:
            The de-duplicated union of every model result's ``domains``, in no
            guaranteed order.
        """
        unique: set[str] = set()
        for model_res in self.model_results:
            unique.update(model_res.domains)
        return list(unique)

    @property
    def task_types(self) -> list[str]:
        """Collect the distinct task types across all model results.

        Returns:
            The de-duplicated union of every model result's ``task_types``, in
            no guaranteed order.
        """
        unique: set[str] = set()
        for model_res in self.model_results:
            unique.update(model_res.task_types)
        return list(unique)

    @property
    def task_names(self) -> list[str]:
        """Collect the distinct task names across all model results.

        Returns:
            The de-duplicated union of every model result's ``task_names``, in
            no guaranteed order.
        """
        unique: set[str] = set()
        for model_res in self.model_results:
            unique.update(model_res.task_names)
        return list(unique)

    @property
    def modalities(self) -> list[str]:
        """Collect the distinct modalities across all model results.

        Returns:
            The de-duplicated union of every model result's ``modalities``, in
            no guaranteed order.
        """
        unique: set[str] = set()
        for model_res in self.model_results:
            unique.update(model_res.modalities)
        return list(unique)

    @property
    def model_names(self) -> list[str]:
        """List the name of every model result.

        Returns:
            One name per entry of ``self.model_results``, in the same order;
            duplicates are kept.
        """
        collected = []
        for entry in self.model_results:
            collected.append(entry.model_name)
        return collected

    @property
    def model_revisions(self) -> list[dict[str, str | None]]:
        """Get all model revisions in the benchmark results.

        Returns:
            A list of dictionaries with model names and revisions.
        """
        return [
            {"model_name": model_res.model_name, "revision": model_res.model_revision}
            for model_res in self.model_results
        ]

`domains` `property` ¶

Get all domains in the benchmark results.

Returns:

Type	Description
`list[str]`	A list of domains.

`languages` `property` ¶

Get all languages in the benchmark results.

Returns:

Type	Description
`list[str]`	A list of languages in ISO 639-1 format.

`modalities` `property` ¶

Get all modalities in the benchmark results.

Returns:

Type	Description
`list[str]`	A list of modalities.

`model_names` `property` ¶

Get all model names in the benchmark results.

Returns:

Type	Description
`list[str]`	A list of model names.

`model_revisions` `property` ¶

Get all model revisions in the benchmark results.

Returns:

Type	Description
`list[dict[str, str \| None]]`	A list of dictionaries with model names and revisions.

`task_names` `property` ¶

Get all task names in the benchmark results.

Returns:

Type	Description
`list[str]`	A list of task names.

`task_types` `property` ¶

Get all task types in the benchmark results.

Returns:

Type	Description
`list[str]`	A list of task types.

`from_dict(data)` `classmethod` ¶

Create BenchmarkResults from a dictionary.

Source code in mteb/results/benchmark_results.py

@classmethod
def from_dict(cls, data: dict[str, Any]) -> Self:
    """Build an instance from a dictionary.

    Args:
        data: Dictionary passed to ``cls.model_validate``.

    Returns:
        The validated instance.
    """
    instance = cls.model_validate(data)
    return instance

`from_disk(path)` `classmethod` ¶

Load the BenchmarkResults from a JSON file.

Parameters:

Name	Type	Description	Default
`path`	`Path \| str`	Path to the JSON file.	required

Returns:

Type	Description
`Self`	An instance of BenchmarkResults.

Source code in mteb/results/benchmark_results.py

@classmethod
def from_disk(cls, path: Path | str) -> Self:
    """Load the BenchmarkResults from a JSON file.

    Args:
        path: Path to the JSON file.

    Returns:
        An instance of BenchmarkResults.
    """
    # Read as UTF-8 explicitly: the default text encoding depends on the
    # platform locale and can mis-decode non-ASCII characters.
    with Path(path).open(encoding="utf-8") as in_file:
        data = json.load(in_file)
    return cls.from_dict(data)

`from_validated(**data)` `classmethod` ¶

Create BenchmarkResults from validated data.

Parameters:

Name	Type	Description	Default
`**data`	`Any`	Arbitrary keyword arguments containing the data.	`{}`

Returns:

Type	Description
`BenchmarkResults`	An instance of BenchmarkResults.

Source code in mteb/results/benchmark_results.py

@classmethod
def from_validated(cls, **data: Any) -> BenchmarkResults:
    """Build an instance from data that is already validated.

    Each entry of ``data["model_results"]`` is expanded into
    ``ModelResult.from_validated`` and the instance is assembled with
    ``cls.model_construct``.

    Args:
        **data: Keyword arguments; must contain ``"model_results"``.

    Returns:
        An instance of BenchmarkResults.
    """
    model_results = [
        ModelResult.from_validated(**entry) for entry in data["model_results"]
    ]
    return cls.model_construct(model_results=model_results)

`get_aggregated_scores()` ¶

Get aggregated scores for each model.

When a benchmark is associated with these results, uses `Benchmark.get_score` to compute scores. Otherwise computes the equivalent statistics directly from all task results.

Returns:

Type	Description
`dict[str, dict[str, float \| None]] \| dict[str, dict[str, dict[str, float \| None]]]`	A dict mapping each model name to a dict with the keys:
`dict[str, dict[str, float \| None]] \| dict[str, dict[str, dict[str, float \| None]]]`	`"Mean(Task)"`: mean score across all (benchmark) tasks.
`dict[str, dict[str, float \| None]] \| dict[str, dict[str, dict[str, float \| None]]]`	`"Mean(TaskType)"`: mean of per-task-type means.

Examples:

>>> bench_results.get_aggregated_scores()
{
    "model1": {"Mean(Task)": 0.5, "Mean(TaskType)": 0.52},
    "model2": {"Mean(Task)": 0.45, "Mean(TaskType)": 0.48},
}

Source code in mteb/results/benchmark_results.py

def get_aggregated_scores(
    self,
) -> (
    dict[str, dict[str, float | None]]
    | dict[str, dict[str, dict[str, float | None]]]  # multiple benchmarks
):
    """Get aggregated scores for each model.

    When a benchmark is associated with these results, uses
    :meth:`Benchmark.get_score` to compute scores.  Otherwise computes
    the equivalent statistics directly from all task results.

    Returns:
        A dict mapping each model name to a dict with the keys:

        - ``"Mean(Task)"``: mean score across all (benchmark) tasks.
        - ``"Mean(TaskType)"``: mean of per-task-type means.

    Examples:
        >>> bench_results.get_aggregated_scores()
        {
            "model1": {"Mean(Task)": 0.5, "Mean(TaskType)": 0.52},
            "model2": {"Mean(Task)": 0.45, "Mean(TaskType)": 0.48},
        }
    """
    if self.benchmark is not None:
        if isinstance(self.benchmark, Sequence):
            return {b.name: b.get_score(self) for b in self.benchmark}
        return self.benchmark.get_score(self)

    from mteb.benchmarks._benchmark_metrics import (
        _compute_mean_task,
        _compute_mean_task_type,
    )

    bench_results = self.join_revisions()
    return {
        model_result.model_name: {
            "Mean(Task)": _compute_mean_task(model_result.task_results),
            "Mean(TaskType)": _compute_mean_task_type(model_result.task_results),
        }
        for model_result in bench_results
    }

`get_benchmark_result()` ¶

Get aggregated scores for each model in the benchmark.

Uses the benchmark's summary table creation method to compute scores.

Returns:

Type	Description
`DataFrame`	A DataFrame with the aggregated benchmark scores for each model.

Source code in mteb/results/benchmark_results.py

def get_benchmark_result(self) -> pd.DataFrame:
    """Build the benchmark summary table for these results.

    The table is produced by the attached benchmark's
    ``_create_summary_table`` method.

    Returns:
        A DataFrame with the aggregated benchmark scores for each model.

    Raises:
        ValueError: If no benchmark is attached, or if a sequence of
            benchmarks is attached.
    """
    benchmark = self.benchmark

    if benchmark is None:
        message = (
            "No benchmark associated with these results (self.benchmark is None). "
            "To get benchmark results, load results with a Benchmark object. "
            "`results = cache.load_results(tasks='MTEB(eng, v2)')`"
        )
        raise ValueError(message)

    if isinstance(benchmark, Sequence):
        raise ValueError("Getting scores for multiple benchmarks is unsupported")

    summary = benchmark._create_summary_table(self)
    return summary

`join_revisions()` ¶

Join revisions of the same model.

In case of conflicts, the following rules are applied: 1) If the main revision is present, it is kept. The main revision is the one defined in the model's ModelMeta object. 2) If there are multiple revisions and some of them are None or NA, those are filtered out. 3) If there is no main revision, we prefer the one run using the latest mteb version.

Returns:

Type	Description
`BenchmarkResults`	A new BenchmarkResults object with the revisions joined.

Source code in mteb/results/benchmark_results.py

def join_revisions(self) -> BenchmarkResults:
    """Join revisions of the same model.

    For every (model, task) pair exactly one task result is kept. In case of
    conflicts, the following rules are applied:
    1) If the main revision is present, it is kept. The main revision is the one defined in the model's ModelMeta object.
    2) If there are multiple revisions and some of them are None or NA, those are deprioritised.
    3) If there is no main revision, we prefer the one run using the latest mteb version.

    Task results without scores are never selected.

    Returns:
        A new BenchmarkResults object with the revisions joined.
    """
    # One record per (model result, task result); the task result object itself
    # is carried along so it can be re-attached after selection.
    records = []
    for model_result in self:
        for task_result in model_result.task_results:
            records.append(
                dict(
                    model=model_result.model_name,
                    revision=model_result.model_revision,
                    task_name=task_result.task_name,
                    mteb_version=task_result.mteb_version,
                    task_result=task_result,
                    has_scores=bool(task_result.scores),
                )
            )
    # Nothing to join: return an empty container instead of building a frame.
    if not records:
        return BenchmarkResults.model_construct(model_results=[])
    task_df = pd.DataFrame.from_records(records)

    # Use cached model metas
    # (presumably a mapping of model name -> main revision; verify against
    # `_get_cached_model_metas`).
    model_to_main_revision = _get_cached_model_metas()
    task_df["main_revision"] = task_df["model"].map(model_to_main_revision)

    # Use cached version parsing
    task_df["mteb_version"] = task_df["mteb_version"].map(_parse_version_cached)

    # Filter out rows without scores first
    # NOTE(review): the columns assigned below are written to this filtered
    # frame; if pandas reports SettingWithCopyWarning here, add `.copy()`.
    task_df = task_df[task_df["has_scores"]]

    # Optimize groupby with vectorized operations
    # Sort by priority: main_revision match, then mteb_version (descending), then revision
    task_df["is_main_revision"] = task_df["revision"] == task_df["main_revision"]

    # Handle None/NA/external revisions
    # Both missing revisions and the literal "external" are mapped to the same
    # sentinel so they can be deprioritised together below.
    task_df["revision_clean"] = task_df["revision"].copy()
    task_df.loc[task_df["revision"].isna(), "revision_clean"] = (
        "no_revision_available"
    )
    task_df.loc[task_df["revision"] == "external", "revision_clean"] = (
        "no_revision_available"
    )

    # Create a priority column for sorting
    # Higher priority = better to keep
    # Priority: main_revision (1000), has valid mteb_version (100), has valid revision (10)
    # The weights are additive and spaced so a higher-ranked criterion always
    # outweighs all lower-ranked ones combined.
    task_df["priority"] = 0
    task_df.loc[task_df["is_main_revision"], "priority"] += 1000
    task_df.loc[task_df["mteb_version"].notna(), "priority"] += 100
    task_df.loc[
        task_df["revision_clean"] != "no_revision_available", "priority"
    ] += 10

    # Sort by priority (desc), mteb_version (desc), and take first per group
    task_df = task_df.sort_values(
        ["model", "task_name", "priority", "mteb_version"],
        ascending=[True, True, False, False],
        na_position="last",
    )

    # NOTE(review): pandas `GroupBy.first()` returns the first non-null value
    # *per column*, not the first row. If the top-ranked row has a missing
    # revision/mteb_version, those cells may be filled from a lower-ranked
    # row of the same group; confirm this is intended.
    task_df = task_df.groupby(["model", "task_name"], as_index=False).first()

    # Reconstruct model results
    model_results = []
    # Group by original revision to maintain deterministic behavior
    # After the first() selection above, each (model, task_name) is unique,
    # so grouping by original revision ensures consistent ModelResult creation
    # NOTE(review): `groupby` drops NA group keys by default (dropna=True), so
    # rows whose revision is still None appear to be excluded from the output;
    # confirm this is intended.
    for (model, model_revision), group in task_df.groupby(["model", "revision"]):
        model_result = ModelResult.model_construct(
            model_name=model,  # type: ignore[arg-type]
            model_revision=model_revision,  # type: ignore[arg-type]
            task_results=list(group["task_result"]),
        )
        model_results.append(model_result)
    return BenchmarkResults.model_construct(model_results=model_results)

`select_models(names, revisions=None)` ¶

Get models by name and revision.

Parameters:

Name	Type	Description	Default
`names`	`list[str] \| list[ModelMeta]`	List of model names to filter by. Can also be a list of ModelMeta objects. In which case, the revision is ignored.	required
`revisions`	`list[str \| None] \| None`	List of model revisions to filter by. If None, all revisions are returned.	`None`

Returns:

Type	Description
`BenchmarkResults`	A new BenchmarkResults object with the filtered models.

Source code in mteb/results/benchmark_results.py

def select_models(
    self,
    names: list[str] | list[ModelMeta],
    revisions: list[str | None] | None = None,
) -> BenchmarkResults:
    """Filter the model results by name and, optionally, revision.

    Args:
        names: Model names to keep. ModelMeta objects are also accepted; for
            those, the name and revision are taken from the object and the
            matching entry of ``revisions`` is ignored.
        revisions: Revisions to keep, aligned with ``names``. A ``None`` entry
            (or ``revisions=None``) accepts every revision of that model.

    Returns:
        A new object of the same type containing only the matching models.

    Raises:
        ValueError: If ``names`` and ``revisions`` differ in length, or if a
            ModelMeta has no name.
    """
    wanted_revisions = [None] * len(names) if revisions is None else revisions
    if len(names) != len(wanted_revisions):
        raise ValueError(
            "The length of names and revisions must be the same or revisions must be None."
        )

    # Requested model name -> required revision (None accepts any revision).
    requested: dict[str, str | None] = {}
    for entry, wanted in zip(names, wanted_revisions):
        if not isinstance(entry, ModelMeta):
            requested[cast("str", entry)] = wanted
            continue
        if entry.name is None:
            raise ValueError("name in ModelMeta is None. It must be a string.")
        requested[entry.name] = entry.revision

    selected = []
    for candidate in self.model_results:
        candidate_name = candidate.model_name
        candidate_revision = candidate.model_revision
        if candidate_name not in requested:
            continue
        required = requested[candidate_name]
        if required is None or candidate_revision == required:
            selected.append(candidate)

    return type(self).model_construct(model_results=selected)

`select_tasks(tasks)` ¶

Select tasks from the benchmark results.

Parameters:

Name	Type	Description	Default
`tasks`	`Iterable[AbsTask]`	List of tasks to select. Can be a list of AbsTask objects or task names.	required

Returns:

Type	Description
`BenchmarkResults`	A new BenchmarkResults object with the selected tasks.

Source code in mteb/results/benchmark_results.py

def select_tasks(self, tasks: Iterable[AbsTask]) -> BenchmarkResults:
    """Restrict every model result to the given tasks.

    Args:
        tasks: Tasks to keep; passed unchanged to each model result's
            ``select_tasks``.

    Returns:
        A new object of the same type holding the filtered model results.
    """
    filtered = []
    for model_res in self.model_results:
        filtered.append(model_res.select_tasks(tasks))
    return type(self).model_construct(model_results=filtered)

`to_dataframe(aggregation_level='task', aggregation_fn=None, include_model_revision=False, format='wide')` ¶

Get a DataFrame with the scores for all models and tasks.

The DataFrame will have the following columns in addition to the metadata columns:

model_name: The name of the model.
task_name: The name of the task.
score: The main score of the model on the task.

In addition, the DataFrame can have the following columns depending on the aggregation level:

split: The split of the task. E.g. "test", "train", "validation".
subset: The subset of the task. E.g. "en", "fr-en".

Afterward, the DataFrame will be aggregated according to the aggregation method and pivoted to either a wide or long format.

Parameters:

Name	Type	Description	Default
`aggregation_level`	`Literal['subset', 'split', 'task', 'language']`	The aggregation to use. Can be one of: - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset. - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split. - "task": Aggregates the scores by task. The DataFrame will have one row per model and task. - "language": Aggregates the scores by language. The DataFrame will have one row per model and language.	`'task'`
`aggregation_fn`	`Callable[[list[Score]], Any] \| None`	The function to use for aggregation. If None, the mean will be used.	`None`
`include_model_revision`	`bool`	If True, the model revision will be included in the DataFrame. If False, it will be excluded. If there are multiple revisions for the same model, they will be joined using the `join_revisions` method.	`False`
`format`	`Literal['wide', 'long']`	The format of the DataFrame. Can be one of: - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells. - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns.	`'wide'`

Returns:

Type	Description
`DataFrame`	A DataFrame with the scores for all models and tasks.

Source code in mteb/results/benchmark_results.py

def to_dataframe(
    self,
    aggregation_level: Literal["subset", "split", "task", "language"] = "task",
    aggregation_fn: Callable[[list[Score]], Any] | None = None,
    include_model_revision: bool = False,
    format: Literal["wide", "long"] = "wide",
) -> pd.DataFrame:
    """Get a DataFrame with the scores for all models and tasks.

    The DataFrame will have the following columns in addition to the metadata columns:

    - model_name: The name of the model.
    - task_name: The name of the task.
    - score: The main score of the model on the task.

    In addition, the DataFrame can have the following columns depending on the aggregation level:

    - split: The split of the task. E.g. "test", "train", "validation".
    - subset: The subset of the task. E.g. "en", "fr-en".

    Afterward, the DataFrame will be aggregated according to the aggregation method and pivoted to either a wide or long format.

    Args:
        aggregation_level: The aggregation to use. Can be one of:
            - "subset"/None: No aggregation will be done. The DataFrame will have one row per model, task, split and subset.
            - "split": Aggregates the scores by split. The DataFrame will have one row per model, task and split.
            - "task": Aggregates the scores by task. The DataFrame will have one row per model and task.
            - "language": Aggregates the scores by language. The DataFrame will have one row per model and language.
        aggregation_fn: The function to use for aggregation. If None, the mean will be used.
        include_model_revision: If True, the model revision will be included in the DataFrame. If False, it will be excluded.
            If there are multiple revisions for the same model, they will be joined using the `join_revisions` method.
        format: The format of the DataFrame. Can be one of:
            - "wide": The DataFrame will be of shape (number of tasks, number of models). Scores will be in the cells.
            - "long": The DataFrame will be of length (number of tasks * number of models). Scores will be in columns.

    Returns:
        A DataFrame with the scores for all models and tasks. An empty
        DataFrame is returned (after a warning) when there are no scores.
    """
    long_df = self._build_pre_agg_df(include_model_revision)
    if long_df is None:
        # The message goes both to the module logger and to `warnings`.
        msg = "No scores data available. Returning empty DataFrame."
        logger.warning(msg)
        warnings.warn(msg)
        return pd.DataFrame()

    if include_model_revision:
        columns = ["model_name", "model_revision"]
    else:
        columns = ["model_name"]

    pivoted = _aggregate_and_pivot(
        long_df,
        columns=columns,
        aggregation_level=aggregation_level,
        aggregation_fn=aggregation_fn,
        format=format,
    )
    # Cast categorical columns back to object so downstream string ops don't
    # raise "can only concatenate str (not Categorical) to str".
    categorical_columns = pivoted.select_dtypes(include="category").columns
    for label in categorical_columns:
        pivoted[label] = pivoted[label].astype(object)
    return pivoted

`to_dict()` ¶

Convert BenchmarkResults to a dictionary.

Source code in mteb/results/benchmark_results.py

def to_dict(self) -> dict[str, Any]:
    """Return these results as a plain dictionary.

    Returns:
        The output of ``self.model_dump()``.
    """
    dumped = self.model_dump()
    return dumped

`to_disk(path)` ¶

Save the BenchmarkResults to a JSON file.

Source code in mteb/results/benchmark_results.py

def to_disk(self, path: Path | str) -> None:
    """Save the BenchmarkResults to a JSON file."""
    path = Path(path)
    with path.open("w") as out_file:
        out_file.write(self.model_dump_json(indent=2))

Results¶

Results cache¶

mteb.cache.ResultCache ¶

default_cache_path property ¶

has_remote property ¶

remote_repo_path property ¶

remote_results_path property ¶

clear_cache() ¶

download_from_remote(remote='https://github.com/embeddings-benchmark/results', download_latest=True, revision=None) ¶

get_cache_paths(models=None, tasks=None, require_model_meta=True, include_remote=True, load_experiments=LoadExperimentEnum.NO_EXPERIMENTS) ¶

get_models(tasks=None, require_model_meta=True, include_remote=True) ¶

get_task_names(models=None, require_model_meta=True, include_remote=True) ¶

get_task_result_path(task_name, model_name, model_revision=None, remote=False, experiment_name=None) ¶

load_results(models=None, tasks=None, *, require_model_meta=True, include_remote=True, validate_and_filter=False, only_main_score=False, load_experiments=LoadExperimentEnum.MATCH_KWARGS, experiment_kwargs=None) ¶

load_task_result(task_name, model_name, model_revision=None, raise_if_not_found=False, prioritize_remote=False, experiment_name=None) ¶

save_to_cache(task_result, model_name, model_revision=None, *, encode_kwargs=None) ¶

submit_results(models=None, *, create_pr=False) ¶

Result Objects¶

mteb.results.TaskResult ¶

domains property ¶

eval_splits property ¶

hf_subsets property ¶

is_public property ¶

languages property ¶

main_score property ¶

task cached property ¶

task_type property ¶

from_dict(data) classmethod ¶

from_disk(path, load_historic_data=True) classmethod ¶

from_task_results(task, scores, evaluation_time, kg_co2_emissions=None, date=None) classmethod ¶

from_validated(**data) classmethod ¶

get_hf_eval_results() ¶

get_missing_evaluations(task) ¶

get_score(splits=None, languages=None, scripts=None, getter=lambda scores: scores['main_score'], aggregation=np.mean) ¶

is_mergeable(result, criteria=['dataset_revision'], raise_error=False) ¶

merge(new_results, criteria=['dataset_revision']) ¶

only_main_score() ¶

to_dict() ¶

to_disk(path) ¶

validate_and_filter_scores(task=None) ¶

mteb.results.ModelResult ¶

domains property ¶

languages property ¶

modalities property ¶

task_names property ¶

task_types property ¶

from_disk(path) classmethod ¶

from_validated(**data) classmethod ¶

push_model_results(user=None, *, benchmark=None, create_pr=False, raise_error=False) ¶

select_tasks(tasks) ¶

to_dataframe(aggregation_level='task', aggregation_fn=None, include_model_revision=False, format='wide') ¶

to_disk(path) ¶

mteb.results.BenchmarkResults ¶

domains property ¶

languages property ¶

modalities property ¶

model_names property ¶

model_revisions property ¶

task_names property ¶

task_types property ¶

from_dict(data) classmethod ¶

from_disk(path) classmethod ¶

from_validated(**data) classmethod ¶

get_aggregated_scores() ¶

get_benchmark_result() ¶

join_revisions() ¶

select_models(names, revisions=None) ¶

select_tasks(tasks) ¶

to_dataframe(aggregation_level='task', aggregation_fn=None, include_model_revision=False, format='wide') ¶

to_dict() ¶

to_disk(path) ¶

`mteb.cache.ResultCache` ¶

`default_cache_path` `property` ¶

`has_remote` `property` ¶

`remote_repo_path` `property` ¶

`remote_results_path` `property` ¶

`clear_cache()` ¶

`download_from_remote(remote='https://github.com/embeddings-benchmark/results', download_latest=True, revision=None)` ¶

`get_cache_paths(models=None, tasks=None, require_model_meta=True, include_remote=True, load_experiments=LoadExperimentEnum.NO_EXPERIMENTS)` ¶

`get_models(tasks=None, require_model_meta=True, include_remote=True)` ¶

`get_task_names(models=None, require_model_meta=True, include_remote=True)` ¶

`get_task_result_path(task_name, model_name, model_revision=None, remote=False, experiment_name=None)` ¶

`load_results(models=None, tasks=None, *, require_model_meta=True, include_remote=True, validate_and_filter=False, only_main_score=False, load_experiments=LoadExperimentEnum.MATCH_KWARGS, experiment_kwargs=None)` ¶

`load_task_result(task_name, model_name, model_revision=None, raise_if_not_found=False, prioritize_remote=False, experiment_name=None)` ¶

`save_to_cache(task_result, model_name, model_revision=None, *, encode_kwargs=None)` ¶

`submit_results(models=None, *, create_pr=False)` ¶

`mteb.results.TaskResult` ¶

`domains` `property` ¶

`eval_splits` `property` ¶

`hf_subsets` `property` ¶

`is_public` `property` ¶

`languages` `property` ¶

`main_score` `property` ¶

`task` `cached` `property` ¶

`task_type` `property` ¶

`from_dict(data)` `classmethod` ¶

`from_disk(path, load_historic_data=True)` `classmethod` ¶

`from_task_results(task, scores, evaluation_time, kg_co2_emissions=None, date=None)` `classmethod` ¶

`from_validated(**data)` `classmethod` ¶

`get_hf_eval_results()` ¶

`get_missing_evaluations(task)` ¶

`get_score(splits=None, languages=None, scripts=None, getter=lambda scores: scores['main_score'], aggregation=np.mean)` ¶

`is_mergeable(result, criteria=['dataset_revision'], raise_error=False)` ¶

`merge(new_results, criteria=['dataset_revision'])` ¶

`only_main_score()` ¶

`to_dict()` ¶

`to_disk(path)` ¶

`validate_and_filter_scores(task=None)` ¶

`mteb.results.ModelResult` ¶

`domains` `property` ¶

`languages` `property` ¶

`modalities` `property` ¶

`task_names` `property` ¶

`task_types` `property` ¶

`from_disk(path)` `classmethod` ¶

`from_validated(**data)` `classmethod` ¶

`push_model_results(user=None, *, benchmark=None, create_pr=False, raise_error=False)` ¶

`select_tasks(tasks)` ¶

`to_dataframe(aggregation_level='task', aggregation_fn=None, include_model_revision=False, format='wide')` ¶

`to_disk(path)` ¶

`mteb.results.BenchmarkResults` ¶

`domains` `property` ¶

`languages` `property` ¶

`modalities` `property` ¶

`model_names` `property` ¶

`model_revisions` `property` ¶

`task_names` `property` ¶

`task_types` `property` ¶

`from_dict(data)` `classmethod` ¶

`from_disk(path)` `classmethod` ¶

`from_validated(**data)` `classmethod` ¶

`get_aggregated_scores()` ¶

`get_benchmark_result()` ¶

`join_revisions()` ¶

`select_models(names, revisions=None)` ¶

`select_tasks(tasks)` ¶

`to_dataframe(aggregation_level='task', aggregation_fn=None, include_model_revision=False, format='wide')` ¶

`to_dict()` ¶

`to_disk(path)` ¶