API

General

General functions for working with the tasks and models implemented in SEB.

seb.registries.get_task(name)

Fetches a task by name.

Parameters:

    name (str, required): The name of the task.

Returns:

    Task: A task.

Source code in seb/registries.py
def get_task(name: str) -> Task:
    """
    Fetches a task by name.

    Args:
        name: The name of the task.

    Returns:
        A task.
    """
    return tasks.get(name)()
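
A brief usage sketch (the task name below is illustrative; any registered name works):

from seb.registries import get_task

task = get_task("DKHate")  # illustrative task name
print(task.name, task.task_type, task.languages)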

seb.registries.get_all_tasks()

Returns all tasks implemented in SEB.

Returns:

    list[Task]: A list of all tasks in SEB.

Source code in seb/registries.py
def get_all_tasks() -> list[Task]:
    """
    Returns all tasks implemented in SEB.

    Returns:
        A list of all tasks in SEB.
    """
    return [get_task(task_name) for task_name in tasks.get_all()]
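
For example, to list every registered task together with its languages:

from seb.registries import get_all_tasks

for task in get_all_tasks():
    print(task.name, task.languages)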

seb.registries.get_model(name)

Fetches a model by name.

Parameters:

    name (str, required): The name of the model.

Returns:

    SebModel: A model including metadata.

Source code in seb/registries.py
def get_model(name: str) -> SebModel:
    """
    Fetches a model by name.

    Args:
        name: The name of the model.

    Returns:
        A model including metadata.
    """
    return models.get(name)()

seb.registries.get_all_models()

Get all the models implemented in SEB.

Returns:

    list[SebModel]: A list of all models in SEB.

Source code in seb/registries.py
def get_all_models() -> list[SebModel]:
    """
    Get all the models implemented in SEB.

    Returns:
        A list of all models in SEB.
    """
    return [get_model(model_name) for model_name in models.get_all()]
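
For example, to inspect which models are registered (only metadata is loaded here; model weights are loaded lazily on first encode):

from seb.registries import get_all_models

models = get_all_models()
print(len(models), "models registered")
for model in models:
    print(model.meta.name)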

Benchmark

seb.benchmark.Benchmark

Benchmark is the main orchestrator of the SEB benchmark.

Source code in seb/benchmark.py
class Benchmark:
    """
    Benchmark is the main orchestrator of the SEB benchmark.
    """

    def __init__(
        self,
        languages: Optional[list[str]] = None,
        tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
    ) -> None:
        """
        Initialize the benchmark.

        Args:
            languages: A list of languages to run the benchmark on. If None, all languages are used.
            tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
        """
        self.languages = languages

        self.tasks = self.get_tasks(tasks, languages)
        self.task_names = [task.name for task in self.tasks]

    @staticmethod
    def get_tasks(
        tasks: Optional[Union[Iterable[str], Iterable[Task]]],
        languages: Optional[list[str]],
    ) -> list[Task]:
        """
        Get the tasks for the benchmark.

        Returns:
            A list of tasks.
        """
        _tasks = []

        if tasks is None:
            _tasks = get_all_tasks()
        else:
            for task in tasks:
                if isinstance(task, str):
                    _tasks.append(get_task(task))
                elif isinstance(task, Task):
                    _tasks.append(task)
                else:
                    raise ValueError(f"Invalid task type: {type(task)}")

        if languages is not None:
            langs = set(languages)
            _tasks = [task for task in _tasks if set(task.languages) & langs]

        return _tasks

    def evaluate_model(
        self,
        model: SebModel,
        use_cache: bool = True,
        run_model: bool = True,
        raise_errors: bool = True,
        cache_dir: Optional[Path] = None,
        verbose: bool = True,
    ) -> BenchmarkResults:
        """
        Evaluate a model on the benchmark.

        Args:
            model: The model to evaluate.
            use_cache: Whether to use the cache.
            run_model: Whether to run the model if the cache is not present.
            raise_errors: Whether to raise errors.
            cache_dir: The cache directory to use. If None, the default cache directory is used.
            verbose: Whether to show a progress bar.

        Returns:
            The results of the benchmark.
        """
        task_results = []
        pbar = tqdm(
            self.tasks,
            position=1,
            desc=f"Running {model.meta.name}",
            leave=False,
            disable=not verbose,
        )
        for task in pbar:
            pbar.set_description(f"Running {model.meta.name} on {task.name}")
            task_result = run_task(
                task,
                model,
                use_cache=use_cache,
                run_model=run_model,
                raise_errors=raise_errors,
                cache_dir=cache_dir,
            )
            task_results.append(task_result)

        return BenchmarkResults(meta=model.meta, task_results=task_results)

    def evaluate_models(
        self,
        models: list[SebModel],
        use_cache: bool = True,
        run_model: bool = True,
        raise_errors: bool = True,
        cache_dir: Optional[Path] = None,
        verbose: bool = True,
    ) -> list[BenchmarkResults]:
        """
        Evaluate a list of models on the benchmark.

        Args:
            models: The models to evaluate.
            use_cache: Whether to use the cache.
            run_model: Whether to run the model if the cache is not present.
            raise_errors: Whether to raise errors.
            cache_dir: The cache directory to use. If None, the default cache directory is used.
            verbose: Whether to show a progress bar.

        Returns:
            The results of the benchmark, once for each model.
        """
        results = []
        pbar = tqdm(
            models,
            position=0,
            desc="Running Benchmark",
            leave=True,
            disable=not verbose,
        )

        for model in pbar:
            pbar.set_description(f"Running {model.meta.name}")
            results.append(
                self.evaluate_model(
                    model,
                    use_cache=use_cache,
                    run_model=run_model,
                    raise_errors=raise_errors,
                    cache_dir=cache_dir,
                    verbose=verbose,
                ),
            )
        return results
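
A minimal end-to-end sketch; the language code and model name below are illustrative and assume the model is registered in SEB:

from seb.benchmark import Benchmark
from seb.registries import get_model

benchmark = Benchmark(languages=["da"])  # restrict to tasks that include Danish
model = get_model("intfloat/multilingual-e5-small")  # illustrative registered model name
results = benchmark.evaluate_model(model)
print(results.get_main_score(lang=["da"]))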

__init__(self, languages=None, tasks=None) special

Initialize the benchmark.

Parameters:

    languages (Optional[list[str]], default None): A list of languages to run the benchmark on. If None, all languages are used.
    tasks (Optional[Union[Iterable[str], Iterable[Task]]], default None): The tasks to run the benchmark on. If None, all tasks are used. Can be specified either as strings or as Task objects.

Source code in seb/benchmark.py
def __init__(
    self,
    languages: Optional[list[str]] = None,
    tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
) -> None:
    """
    Initialize the benchmark.

    Args:
        languages: A list of languages to run the benchmark on. If None, all languages are used.
        tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
    """
    self.languages = languages

    self.tasks = self.get_tasks(tasks, languages)
    self.task_names = [task.name for task in self.tasks]

evaluate_model(self, model, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)

Evaluate a model on the benchmark.

Parameters:

    model (SebModel, required): The model to evaluate.
    use_cache (bool, default True): Whether to use the cache.
    run_model (bool, default True): Whether to run the model if the cache is not present.
    raise_errors (bool, default True): Whether to raise errors.
    cache_dir (Optional[Path], default None): The cache directory to use. If None, the default cache directory is used.
    verbose (bool, default True): Whether to show a progress bar.

Returns:

    BenchmarkResults: The results of the benchmark.

Source code in seb/benchmark.py
def evaluate_model(
    self,
    model: SebModel,
    use_cache: bool = True,
    run_model: bool = True,
    raise_errors: bool = True,
    cache_dir: Optional[Path] = None,
    verbose: bool = True,
) -> BenchmarkResults:
    """
    Evaluate a model on the benchmark.

    Args:
        model: The model to evaluate.
        use_cache: Whether to use the cache.
        run_model: Whether to run the model if the cache is not present.
        raise_errors: Whether to raise errors.
        cache_dir: The cache directory to use. If None, the default cache directory is used.
        verbose: Whether to show a progress bar.

    Returns:
        The results of the benchmark.
    """
    task_results = []
    pbar = tqdm(
        self.tasks,
        position=1,
        desc=f"Running {model.meta.name}",
        leave=False,
        disable=not verbose,
    )
    for task in pbar:
        pbar.set_description(f"Running {model.meta.name} on {task.name}")
        task_result = run_task(
            task,
            model,
            use_cache=use_cache,
            run_model=run_model,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
        )
        task_results.append(task_result)

    return BenchmarkResults(meta=model.meta, task_results=task_results)

evaluate_models(self, models, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)

Evaluate a list of models on the benchmark.

Parameters:

    models (list[SebModel], required): The models to evaluate.
    use_cache (bool, default True): Whether to use the cache.
    run_model (bool, default True): Whether to run the model if the cache is not present.
    raise_errors (bool, default True): Whether to raise errors.
    cache_dir (Optional[Path], default None): The cache directory to use. If None, the default cache directory is used.
    verbose (bool, default True): Whether to show a progress bar.

Returns:

    list[BenchmarkResults]: The results of the benchmark, one for each model.

Source code in seb/benchmark.py
def evaluate_models(
    self,
    models: list[SebModel],
    use_cache: bool = True,
    run_model: bool = True,
    raise_errors: bool = True,
    cache_dir: Optional[Path] = None,
    verbose: bool = True,
) -> list[BenchmarkResults]:
    """
    Evaluate a list of models on the benchmark.

    Args:
        models: The models to evaluate.
        use_cache: Whether to use the cache.
        run_model: Whether to run the model if the cache is not present.
        raise_errors: Whether to raise errors.
        cache_dir: The cache directory to use. If None, the default cache directory is used.
        verbose: Whether to show a progress bar.

    Returns:
        The results of the benchmark, once for each model.
    """
    results = []
    pbar = tqdm(
        models,
        position=0,
        desc="Running Benchmark",
        leave=True,
        disable=not verbose,
    )

    for model in pbar:
        pbar.set_description(f"Running {model.meta.name}")
        results.append(
            self.evaluate_model(
                model,
                use_cache=use_cache,
                run_model=run_model,
                raise_errors=raise_errors,
                cache_dir=cache_dir,
                verbose=verbose,
            ),
        )
    return results

get_tasks(tasks, languages) staticmethod

Get the tasks for the benchmark.

Returns:

    list[Task]: A list of tasks.

Source code in seb/benchmark.py
@staticmethod
def get_tasks(
    tasks: Optional[Union[Iterable[str], Iterable[Task]]],
    languages: Optional[list[str]],
) -> list[Task]:
    """
    Get the tasks for the benchmark.

    Returns:
        A list of tasks.
    """
    _tasks = []

    if tasks is None:
        _tasks = get_all_tasks()
    else:
        for task in tasks:
            if isinstance(task, str):
                _tasks.append(get_task(task))
            elif isinstance(task, Task):
                _tasks.append(task)
            else:
                raise ValueError(f"Invalid task type: {type(task)}")

    if languages is not None:
        langs = set(languages)
        _tasks = [task for task in _tasks if set(task.languages) & langs]

    return _tasks

Interfaces

SEB implements two main interfaces: a task interface, which represents a task within the benchmark, and a model interface, which represents a model applied to those tasks.

Model Interface

seb.interfaces.model.Encoder (Protocol)

Interface which all models must implement.

Source code in seb/interfaces/model.py
@runtime_checkable
class Encoder(Protocol):
    """
    Interface which all models must implement.
    """

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional["Task"] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> np.ndarray:
        """Returns a list of embeddings for the given sentences.

        Args:
            sentences: List of sentences to encode
            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
                to be used.
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method

        Returns:
            Embeddings for the given documents
        """
        ...

    # The following methods are optional and can be implemented if the model supports them.
    # def to(self, device: torch.device):
    #     ...

    # def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
    #     ...

    # def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
    #     ...

encode(self, sentences, *, task=None, batch_size=32, **kwargs)

Returns a list of embeddings for the given sentences.

Parameters:

    sentences (list[str], required): List of sentences to encode.
    task (Optional[Task], default None): The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used.
    batch_size (int, default 32): Batch size for the encoding.
    kwargs (Any): Additional arguments passed to the model's encode method.

Returns:

    np.ndarray: Embeddings for the given documents.

Source code in seb/interfaces/model.py
def encode(
    self,
    sentences: list[str],
    *,
    task: Optional["Task"] = None,
    batch_size: int = 32,
    **kwargs: Any,
) -> np.ndarray:
    """Returns a list of embeddings for the given sentences.

    Args:
        sentences: List of sentences to encode
        task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
            to be used.
        batch_size: Batch size for the encoding
        kwargs: arguments to pass to the models encode method

    Returns:
        Embeddings for the given documents
    """
    ...
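
Any object exposing a compatible encode method satisfies this protocol. A minimal sketch wrapping sentence-transformers (an assumption about your setup; SEB ships its own model wrappers):

import numpy as np
from sentence_transformers import SentenceTransformer

class MyEncoder:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def encode(self, sentences, *, task=None, batch_size=32, **kwargs):
        # The task argument is ignored here, but could be used to pick a task-specific prompt.
        return np.asarray(self.model.encode(sentences, batch_size=batch_size, **kwargs))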

seb.interfaces.model.LazyLoadEncoder (Encoder) dataclass

Encoder object which lazily loads the underlying model on the first call to encode().

Source code in seb/interfaces/model.py
@dataclass
class LazyLoadEncoder(Encoder):
    """Encoder object, which lazy loads the model on the first call to encode()"""

    loader: Callable[[], Encoder]
    _model: Optional[Encoder] = None

    def load_model(self):
        """
        Load the model.
        """
        if self._model is None:
            self._model = self.loader()

    def to(self, device: torch.device):
        self.load_model()
        try:
            self._model = self._model.to(device)  # type: ignore
        except AttributeError:
            logging.debug(f"Model {self._model} does not have a to method")

    @property
    def model(self) -> Encoder:
        """
        Dynamically load the model.
        """
        self.load_model()
        return self._model  # type: ignore

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional["Task"] = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """
        Returns a list of embeddings for the given sentences.
        Args:
            sentences: List of sentences to encode
            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
                to be used.
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method

        Returns:
            Embeddings for the given documents
        """
        return self.model.encode(sentences, task=task, **kwargs)

    def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_queries(queries, **kwargs)  # type: ignore
        except AttributeError:
            return self.encode(queries, **kwargs)

    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_corpus(corpus, **kwargs)  # type: ignore
        except AttributeError:
            sep = " "
            if isinstance(corpus, dict):
                sentences = [
                    (corpus["title"][i] + sep + corpus["text"][i]).strip() if "title" in corpus else corpus["text"][i].strip()  # type: ignore
                    for i in range(len(corpus["text"]))  # type: ignore
                ]
            else:
                sentences = [(doc["title"] + sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus]
            return self.encode(sentences, **kwargs)
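
A sketch of deferring model construction until the first call (MyEncoder is the hypothetical wrapper from the sketch above):

from seb.interfaces.model import LazyLoadEncoder

encoder = LazyLoadEncoder(loader=lambda: MyEncoder())  # nothing is loaded yet
embeddings = encoder.encode(["et eksempel på en sætning"])  # the loader runs here, on first use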

model: Encoder property readonly

Dynamically load the model.

__init__(self, loader, _model=None) special

Initialize the encoder with a loader callable that returns the underlying Encoder; the model itself is not loaded until it is first needed.

encode(self, sentences, *, task=None, **kwargs)

Returns a list of embeddings for the given sentences.

Parameters:

    sentences (list[str], required): List of sentences to encode.
    task (Optional[Task], default None): The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used.
    kwargs (Any): Additional arguments passed to the underlying model's encode method (e.g. batch_size).

Returns:

    np.ndarray: Embeddings for the given documents.

Source code in seb/interfaces/model.py
def encode(
    self,
    sentences: list[str],
    *,
    task: Optional["Task"] = None,
    **kwargs: Any,
) -> np.ndarray:
    """
    Returns a list of embeddings for the given sentences.
    Args:
        sentences: List of sentences to encode
        task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
            to be used.
        batch_size: Batch size for the encoding
        kwargs: arguments to pass to the models encode method

    Returns:
        Embeddings for the given documents
    """
    return self.model.encode(sentences, task=task, **kwargs)

load_model(self)

Load the model.

Source code in seb/interfaces/model.py
def load_model(self):
    """
    Load the model.
    """
    if self._model is None:
        self._model = self.loader()

seb.interfaces.model.SebModel dataclass

An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit) and includes metadata pertaining to the specific model.

Source code in seb/interfaces/model.py
@dataclass
class SebModel:
    """
    An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit)
    and includes metadata pertaining to the specific model.
    """

    meta: ModelMeta
    encoder: Encoder

    @property
    def number_of_parameters(self) -> Optional[int]:
        """
        Returns the number of parameters in the model.
        """
        if hasattr(self.encoder, "num_parameters"):
            return sum(p.numel() for p in self.encoder.parameters() if p.requires_grad)  # type: ignore
        return None
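
In practice a SebModel is usually obtained from the registry and its metadata inspected, e.g. (the model name is illustrative):

from seb.registries import get_model

model = get_model("intfloat/multilingual-e5-small")  # illustrative registered name
print(model.meta.name)
print(model.number_of_parameters)  # None if the encoder does not expose parameters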

number_of_parameters: Optional[int] property readonly

Returns the number of parameters in the model.

Task Interface

seb.interfaces.task.Task (Protocol)

A task is a specific evaluation task for a sentence embedding model.

Attributes:

    name (str): The name of the task.
    main_score (str): The main score of the task.
    reference (str): A reference to the task.
    version (str): The version of the task.
    languages (list[Language]): The languages of the task.
    domain (list[Domain]): The domains of the task. Should be one of the categories listed on https://universaldependencies.org.
    task_type (Literal["Classification", "Retrieval", "STS", "BitextMining", "Clustering", "Speed"]): The task type, which determines how the task is evaluated, e.g. Classification.
    task_subtypes (list[str]): A list of subtypes, e.g. Sentiment Classification.
    description (str): A description of the task.

Source code in seb/interfaces/task.py
@runtime_checkable
class Task(Protocol):
    """
    A task is a specific evaluation task for a sentence embedding model.

    Attributes:
        name: The name of the task.
        main_score: The main score of the task.
        reference: A reference to the task.
        version: The version of the task.
        languages: The languages of the task.
        domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
        task_type: The task type; determines how the task is evaluated. E.g. Classification.
        task_subtypes: A list of subtypes, e.g. Sentiment Classification.
        description: A description of the task.
    """

    name: str
    main_score: str
    reference: str
    version: str
    languages: list[Language]
    domain: list[Domain]
    task_type: TaskType
    task_subtypes: list[str]
    description: str

    def evaluate(self, model: Encoder) -> TaskResult:
        """
        Evaluates a Sentence Embedding Model on the task.

        Args:
            model: A model with the encode method implemented.

        Returns:
            A TaskResult object.
        """
        ...

    def get_documents(self) -> list[str]:
        """
        Get the documents for the task.

        Returns:
            A list of strings.
        """
        ...

    def get_descriptive_stats(self) -> DescriptiveDatasetStats:
        texts = self.get_documents()
        document_lengths = np.array([len(text) for text in texts])

        mean = float(np.mean(document_lengths))
        std = float(np.std(document_lengths))
        return DescriptiveDatasetStats(
            mean_document_length=mean,
            std_document_length=std,
            num_documents=len(document_lengths),
        )

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.name.replace("/", "__").replace(" ", "_")
        return name
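
For example, descriptive statistics can be computed for every registered task (note that this loads each task's dataset):

from seb.registries import get_all_tasks

for task in get_all_tasks():
    stats = task.get_descriptive_stats()
    print(task.name, stats)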

evaluate(self, model)

Evaluates a Sentence Embedding Model on the task.

Parameters:

    model (Encoder, required): A model with the encode method implemented.

Returns:

    TaskResult: A TaskResult object.

Source code in seb/interfaces/task.py
def evaluate(self, model: Encoder) -> TaskResult:
    """
    Evaluates a Sentence Embedding Model on the task.

    Args:
        model: A model with the encode method implemented.

    Returns:
        A TaskResult object.
    """
    ...

get_documents(self)

Get the documents for the task.

Returns:

    list[str]: A list of strings.

Source code in seb/interfaces/task.py
def get_documents(self) -> list[str]:
    """
    Get the documents for the task.

    Returns:
        A list of strings.
    """
    ...

name_to_path(self)

Convert a name to a path.

Source code in seb/interfaces/task.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.name.replace("/", "__").replace(" ", "_")
    return name

Data Classes

SEB uses data classes to store the results of a benchmark. The following classes are available:

seb.result_dataclasses.BenchmarkResults (BaseModel)

Dataclass for storing benchmark results.

Attributes:

    meta (ModelMeta): ModelMeta object.
    task_results (list[Union[TaskResult, TaskError]]): List of TaskResult (or TaskError) objects.

Source code in seb/result_dataclasses.py
class BenchmarkResults(BaseModel):
    """
    Dataclass for storing benchmark results.

    Attributes:
        meta: ModelMeta object.
        task_results: List of TaskResult objects.
    """

    meta: ModelMeta
    task_results: list[Union[TaskResult, TaskError]]

    def get_main_score(self, lang: Optional[Iterable[Language]] = None) -> float:
        scores = [t.get_main_score(lang) for t in self.task_results]
        if scores:
            return sum(scores) / len(scores)
        return np.nan

    def __iter__(self) -> Iterator[Union[TaskResult, TaskError]]:  # type: ignore
        return iter(self.task_results)

    def __getitem__(self, index: int) -> Union[TaskResult, TaskError]:
        return self.task_results[index]

    def __len__(self) -> int:
        return len(self.task_results)

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        if path.is_file():
            raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
        path.mkdir(parents=True, exist_ok=True)
        for task_result in self.task_results:
            if isinstance(task_result, TaskResult):
                task_result.to_disk(path / f"{task_result.task_name}.json")
            else:
                task_result.to_disk(path / f"{task_result.task_name}.error.json")

        meta_path = path / "meta.json"
        self.meta.to_disk(meta_path)

    @classmethod
    def from_disk(cls, path: Path) -> "BenchmarkResults":
        """
        Load task results from a path.
        """
        if not path.is_dir():
            raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
        task_results = []
        for file in path.glob("*.json"):
            if file.stem == "meta":
                continue
            if file.stem.endswith(".error"):
                task_results.append(TaskError.from_disk(file))
            else:
                task_results.append(TaskResult.from_disk(file))

        meta_path = path / "meta.json"
        meta = ModelMeta.from_disk(meta_path)
        return cls(meta=meta, task_results=task_results)
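
Results can be round-tripped through a directory; here results is assumed to be a BenchmarkResults instance, e.g. from Benchmark.evaluate_model:

from pathlib import Path
from seb.result_dataclasses import BenchmarkResults

results.to_disk(Path("results/my-model"))  # one JSON file per task, plus meta.json
reloaded = BenchmarkResults.from_disk(Path("results/my-model"))
print(reloaded.get_main_score())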

from_disk(path) classmethod

Load task results from a path.

Source code in seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "BenchmarkResults":
    """
    Load task results from a path.
    """
    if not path.is_dir():
        raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
    task_results = []
    for file in path.glob("*.json"):
        if file.stem == "meta":
            continue
        if file.stem.endswith(".error"):
            task_results.append(TaskError.from_disk(file))
        else:
            task_results.append(TaskResult.from_disk(file))

    meta_path = path / "meta.json"
    meta = ModelMeta.from_disk(meta_path)
    return cls(meta=meta, task_results=task_results)

to_disk(self, path)

Write task results to a path.

Source code in seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    if path.is_file():
        raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
    path.mkdir(parents=True, exist_ok=True)
    for task_result in self.task_results:
        if isinstance(task_result, TaskResult):
            task_result.to_disk(path / f"{task_result.task_name}.json")
        else:
            task_result.to_disk(path / f"{task_result.task_name}.error.json")

    meta_path = path / "meta.json"
    self.meta.to_disk(meta_path)

seb.result_dataclasses.TaskResult (BaseModel)

Dataclass for storing task results.

Attributes:

    task_name (str): Name of the task.
    task_description (str): Description of the task.
    task_version (str): Version of the task.
    time_of_run (datetime): Time of the run.
    scores (dict[Language, dict[str, Union[float, str]]]): Dictionary of scores of the form {language: {"metric": value}}.
    main_score (str): Name of the main score.

Source code in seb/result_dataclasses.py
class TaskResult(BaseModel):
    """
    Dataclass for storing task results.

    Attributes:
        task_name: Name of the task.
        task_description: Description of the task.
        task_version: Version of the task.
        time_of_run: Time of the run.
        scores: Dictionary of scores on the form {language: {"metric": value}}.
        main_score: Name of the main score.
    """

    task_name: str
    task_description: str
    task_version: str
    time_of_run: datetime
    scores: dict[Language, dict[str, Union[float, str]]]  # {language: {"metric": value}}.
    main_score: str

    def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
        """
        Returns the main score for a given set of languages.

        Args:
            lang: List of languages to get the main score for.

        Returns:
            The main score.
        """
        main_scores = []
        if lang is None:
            lang = self.scores.keys()

        for l in lang:
            main_scores.append(self.scores[l][self.main_score])  # type: ignore

        return sum(main_scores) / len(main_scores)

    @property
    def languages(self) -> list[Language]:
        """
        Returns the languages of the task.
        """
        return list(self.scores.keys())

    @classmethod
    def from_disk(cls, path: Path) -> "TaskResult":
        """
        Load task results from a path.
        """
        with path.open("r") as f:
            task_results = json.load(f)
        return cls(**task_results)

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        json_str: str = self.model_dump_json()  # type: ignore

        with path.open("w") as f:
            f.write(json_str)

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.task_name.replace("/", "__").replace(" ", "_")
        return name
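
A single result can also be loaded and scored for a subset of languages (the path and language code below are illustrative):

from pathlib import Path
from seb.result_dataclasses import TaskResult

result = TaskResult.from_disk(Path("results/my-model/DKHate.json"))
print(result.languages)               # languages present in the scores dict
print(result.get_main_score(["da"]))  # main score averaged over Danish only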

languages: list property readonly

Returns the languages of the task.

from_disk(path) classmethod

Load task results from a path.

Source code in seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskResult":
    """
    Load task results from a path.
    """
    with path.open("r") as f:
        task_results = json.load(f)
    return cls(**task_results)

get_main_score(self, lang=None)

Returns the main score for a given set of languages.

Parameters:

    lang (Optional[Iterable[str]], default None): List of languages to get the main score for. If None, all languages are used.

Returns:

    float: The main score, averaged over the selected languages.

Source code in seb/result_dataclasses.py
def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
    """
    Returns the main score for a given set of languages.

    Args:
        lang: List of languages to get the main score for.

    Returns:
        The main score.
    """
    main_scores = []
    if lang is None:
        lang = self.scores.keys()

    for l in lang:
        main_scores.append(self.scores[l][self.main_score])  # type: ignore

    return sum(main_scores) / len(main_scores)

name_to_path(self)

Convert a name to a path.

Source code in seb/result_dataclasses.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.task_name.replace("/", "__").replace(" ", "_")
    return name

to_disk(self, path)

Write task results to a path.

Source code in seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    json_str: str = self.model_dump_json()  # type: ignore

    with path.open("w") as f:
        f.write(json_str)

seb.result_dataclasses.TaskError (BaseModel)

Dataclass for recording an error encountered while running a task.

Source code in seb/result_dataclasses.py
class TaskError(BaseModel):
    task_name: str
    error: str
    time_of_run: datetime
    languages: list[str] = []

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        json_str: str = self.model_dump_json()  # type: ignore

        with path.open("w") as f:
            f.write(json_str)

    @classmethod
    def from_disk(cls, path: Path) -> "TaskError":
        """
        Load task results from a path.
        """
        with path.open() as f:
            task_results = json.load(f)
        return cls(**task_results)

    @staticmethod
    def get_main_score(lang: Optional[Iterable[str]] = None) -> float:  # noqa: ARG004
        return np.nan

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.task_name.replace("/", "__").replace(" ", "_")
        return name
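
When a benchmark is run with raise_errors=False, failed tasks are typically recorded as TaskError entries alongside TaskResult entries (task_results is typed as list[Union[TaskResult, TaskError]]), so downstream code branches on the type:

from seb.result_dataclasses import TaskError

for task_result in results:  # results: a BenchmarkResults instance
    if isinstance(task_result, TaskError):
        print(f"{task_result.task_name} failed: {task_result.error}")
    else:
        print(task_result.task_name, task_result.get_main_score())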

from_disk(path) classmethod

Load task results from a path.

Source code in seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskError":
    """
    Load task results from a path.
    """
    with path.open() as f:
        task_results = json.load(f)
    return cls(**task_results)

name_to_path(self)

Convert a name to a path.

Source code in seb/result_dataclasses.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.task_name.replace("/", "__").replace(" ", "_")
    return name

to_disk(self, path)

Write task results to a path.

Source code in seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    json_str: str = self.model_dump_json()  # type: ignore

    with path.open("w") as f:
        f.write(json_str)