API

General

General functions for working with the tasks and models implemented in SEB.

seb.get_task(name)

Fetches a task by name.

Parameters:

    name (str): The name of the task. Required.

Returns:

    Task: A task.

Source code in src/seb/registries.py
def get_task(name: str) -> Task:
    """
    Fetches a task by name.

    Args:
        name: The name of the task.

    Returns:
        A task.
    """
    return tasks.get(name)()
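
A minimal usage sketch (the task name below is illustrative; use seb.get_all_tasks() to see the registered names):

import seb

# Fetch a single registered task by name (the name here is illustrative).
task = seb.get_task("DKHate")
print(task.name, task.main_score)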

seb.get_all_tasks()

Returns all tasks implemented in SEB.

Returns:

    list[Task]: A list of all tasks in SEB.

Source code in src/seb/registries.py
def get_all_tasks() -> list[Task]:
    """
    Returns all tasks implemented in SEB.

    Returns:
        A list of all tasks in SEB.
    """
    return [get_task(task_name) for task_name in tasks.get_all()]
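
For example, to list every registered task and its main metric:

import seb

# Enumerate all registered tasks and print their names and main metrics.
for task in seb.get_all_tasks():
    print(f"{task.name}: {task.main_score}")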

seb.get_model(name)

Fetches a model by name.

Parameters:

    name (str): The name of the model. Required.

Returns:

    SebModel: A model including metadata.

Source code in src/seb/registries.py
def get_model(name: str) -> SebModel:
    """
    Fetches a model by name.

    Args:
        name: The name of the model.

    Returns:
        A model including metadata.
    """
    return models.get(name)()
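
A minimal sketch (the model name is illustrative; use seb.get_all_models() to see the registered names):

import seb

# Fetch a registered model by name (the name here is illustrative).
model = seb.get_model("sentence-transformers/all-MiniLM-L6-v2")
print(model.meta.name)
# The underlying encoder is loaded lazily, so fetching the model is cheap.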

seb.get_all_models()

Get all the models implemented in SEB.

Returns:

    list[SebModel]: A list of all models in SEB.

Source code in src/seb/registries.py
def get_all_models() -> list[SebModel]:
    """
    Get all the models implemented in SEB.

    Returns:
        A list of all models in SEB.
    """
    return [get_model(model_name) for model_name in models.get_all()]
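
For example, to list the names of all registered models:

import seb

# Print the name of every model registered in SEB.
for model in seb.get_all_models():
    print(model.meta.name)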

Benchmark

seb.Benchmark

Benchmark is the main orchestrator of the SEB benchmark.

Source code in src/seb/benchmark.py
class Benchmark:
    """
    Benchmark is the main orchestrator of the SEB benchmark.
    """

    def __init__(
        self,
        languages: Optional[list[str]] = None,
        tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
    ) -> None:
        """
        Initialize the benchmark.

        Args:
            languages: A list of languages to run the benchmark on. If None, all languages are used.
            tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
        """
        self.languages = languages

        self.tasks = self.get_tasks(tasks, languages)
        self.task_names = [task.name for task in self.tasks]

    @staticmethod
    def get_tasks(
        tasks: Optional[Union[Iterable[str], Iterable[Task]]],
        languages: Optional[list[str]],
    ) -> list[Task]:
        """
        Get the tasks for the benchmark.

        Returns:
            A list of tasks.
        """
        _tasks = []

        if tasks is None:
            _tasks = get_all_tasks()
        else:
            for task in tasks:
                if isinstance(task, str):
                    _tasks.append(get_task(task))
                elif isinstance(task, Task):
                    _tasks.append(task)
                else:
                    raise ValueError(f"Invalid task type: {type(task)}")

        if languages is not None:
            langs = set(languages)
            _tasks = [task for task in _tasks if set(task.languages) & langs]

        return _tasks

    def evaluate_model(
        self,
        model: SebModel,
        use_cache: bool = True,
        run_model: bool = True,
        raise_errors: bool = True,
        cache_dir: Optional[Path] = None,
        verbose: bool = True,
    ) -> BenchmarkResults:
        """
        Evaluate a model on the benchmark.

        Args:
            model: The model to evaluate.
            use_cache: Whether to use the cache.
            run_model: Whether to run the model if the cache is not present.
            raise_errors: Whether to raise errors.
            cache_dir: The cache directory to use. If None, the default cache directory is used.
            verbose: Whether to show a progress bar.

        Returns:
            The results of the benchmark.
        """
        task_results = []
        pbar = tqdm(
            self.tasks,
            position=1,
            desc=f"Running {model.meta.name}",
            leave=False,
            disable=not verbose,
        )
        for task in pbar:
            pbar.set_description(f"Running {model.meta.name} on {task.name}")
            task_result = run_task(
                task,
                model,
                use_cache=use_cache,
                run_model=run_model,
                raise_errors=raise_errors,
                cache_dir=cache_dir,
            )
            task_results.append(task_result)

        return BenchmarkResults(meta=model.meta, task_results=task_results)

    def evaluate_models(
        self,
        models: list[SebModel],
        use_cache: bool = True,
        run_model: bool = True,
        raise_errors: bool = True,
        cache_dir: Optional[Path] = None,
        verbose: bool = True,
    ) -> list[BenchmarkResults]:
        """
        Evaluate a list of models on the benchmark.

        Args:
            models: The models to evaluate.
            use_cache: Whether to use the cache.
            run_model: Whether to run the model if the cache is not present.
            raise_errors: Whether to raise errors.
            cache_dir: The cache directory to use. If None, the default cache directory is used.
            verbose: Whether to show a progress bar.

        Returns:
            The results of the benchmark, once for each model.
        """
        results = []
        pbar = tqdm(
            models,
            position=0,
            desc="Running Benchmark",
            leave=True,
            disable=not verbose,
        )

        for model in pbar:
            pbar.set_description(f"Running {model.meta.name}")
            results.append(
                self.evaluate_model(
                    model,
                    use_cache=use_cache,
                    run_model=run_model,
                    raise_errors=raise_errors,
                    cache_dir=cache_dir,
                    verbose=verbose,
                ),
            )
        return results
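
A minimal end-to-end sketch (the language code and model name are illustrative):

import seb

# Restrict the benchmark to a single language and evaluate one registered model.
benchmark = seb.Benchmark(languages=["da"])
model = seb.get_model("sentence-transformers/all-MiniLM-L6-v2")
results = benchmark.evaluate_model(model)

# Average of the main scores across the tasks that were run.
print(results.get_main_score())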

__init__(languages=None, tasks=None)

Initialize the benchmark.

Parameters:

    languages (Optional[list[str]], default None): A list of languages to run the benchmark on. If None, all languages are used.
    tasks (Optional[Union[Iterable[str], Iterable[Task]]], default None): The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
Source code in src/seb/benchmark.py
def __init__(
    self,
    languages: Optional[list[str]] = None,
    tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
) -> None:
    """
    Initialize the benchmark.

    Args:
        languages: A list of languages to run the benchmark on. If None, all languages are used.
        tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
    """
    self.languages = languages

    self.tasks = self.get_tasks(tasks, languages)
    self.task_names = [task.name for task in self.tasks]

evaluate_model(model, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)

Evaluate a model on the benchmark.

Parameters:

    model (SebModel): The model to evaluate. Required.
    use_cache (bool, default True): Whether to use the cache.
    run_model (bool, default True): Whether to run the model if the cache is not present.
    raise_errors (bool, default True): Whether to raise errors.
    cache_dir (Optional[Path], default None): The cache directory to use. If None, the default cache directory is used.
    verbose (bool, default True): Whether to show a progress bar.

Returns:

    BenchmarkResults: The results of the benchmark.

Source code in src/seb/benchmark.py
def evaluate_model(
    self,
    model: SebModel,
    use_cache: bool = True,
    run_model: bool = True,
    raise_errors: bool = True,
    cache_dir: Optional[Path] = None,
    verbose: bool = True,
) -> BenchmarkResults:
    """
    Evaluate a model on the benchmark.

    Args:
        model: The model to evaluate.
        use_cache: Whether to use the cache.
        run_model: Whether to run the model if the cache is not present.
        raise_errors: Whether to raise errors.
        cache_dir: The cache directory to use. If None, the default cache directory is used.
        verbose: Whether to show a progress bar.

    Returns:
        The results of the benchmark.
    """
    task_results = []
    pbar = tqdm(
        self.tasks,
        position=1,
        desc=f"Running {model.meta.name}",
        leave=False,
        disable=not verbose,
    )
    for task in pbar:
        pbar.set_description(f"Running {model.meta.name} on {task.name}")
        task_result = run_task(
            task,
            model,
            use_cache=use_cache,
            run_model=run_model,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
        )
        task_results.append(task_result)

    return BenchmarkResults(meta=model.meta, task_results=task_results)

evaluate_models(models, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)

Evaluate a list of models on the benchmark.

Parameters:

    models (list[SebModel]): The models to evaluate. Required.
    use_cache (bool, default True): Whether to use the cache.
    run_model (bool, default True): Whether to run the model if the cache is not present.
    raise_errors (bool, default True): Whether to raise errors.
    cache_dir (Optional[Path], default None): The cache directory to use. If None, the default cache directory is used.
    verbose (bool, default True): Whether to show a progress bar.

Returns:

    list[BenchmarkResults]: The results of the benchmark, one set of results per model.

Source code in src/seb/benchmark.py
def evaluate_models(
    self,
    models: list[SebModel],
    use_cache: bool = True,
    run_model: bool = True,
    raise_errors: bool = True,
    cache_dir: Optional[Path] = None,
    verbose: bool = True,
) -> list[BenchmarkResults]:
    """
    Evaluate a list of models on the benchmark.

    Args:
        models: The models to evaluate.
        use_cache: Whether to use the cache.
        run_model: Whether to run the model if the cache is not present.
        raise_errors: Whether to raise errors.
        cache_dir: The cache directory to use. If None, the default cache directory is used.
        verbose: Whether to show a progress bar.

    Returns:
        The results of the benchmark, once for each model.
    """
    results = []
    pbar = tqdm(
        models,
        position=0,
        desc="Running Benchmark",
        leave=True,
        disable=not verbose,
    )

    for model in pbar:
        pbar.set_description(f"Running {model.meta.name}")
        results.append(
            self.evaluate_model(
                model,
                use_cache=use_cache,
                run_model=run_model,
                raise_errors=raise_errors,
                cache_dir=cache_dir,
                verbose=verbose,
            ),
        )
    return results
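
A sketch of evaluating several models and writing each result set to disk (the model names and output directory are illustrative):

from pathlib import Path

import seb

benchmark = seb.Benchmark()
models = [seb.get_model(name) for name in ("model-a", "model-b")]  # illustrative names

for result in benchmark.evaluate_models(models):
    # Each BenchmarkResults is written to its own directory of JSON files.
    result.to_disk(Path("results") / result.meta.name)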

get_tasks(tasks, languages) staticmethod

Get the tasks for the benchmark.

Returns:

    list[Task]: A list of tasks.

Source code in src/seb/benchmark.py
@staticmethod
def get_tasks(
    tasks: Optional[Union[Iterable[str], Iterable[Task]]],
    languages: Optional[list[str]],
) -> list[Task]:
    """
    Get the tasks for the benchmark.

    Returns:
        A list of tasks.
    """
    _tasks = []

    if tasks is None:
        _tasks = get_all_tasks()
    else:
        for task in tasks:
            if isinstance(task, str):
                _tasks.append(get_task(task))
            elif isinstance(task, Task):
                _tasks.append(task)
            else:
                raise ValueError(f"Invalid task type: {type(task)}")

    if languages is not None:
        langs = set(languages)
        _tasks = [task for task in _tasks if set(task.languages) & langs]

    return _tasks

Interfaces

SEB implements two main interfaces: a task interface, which represents a task within the benchmark, and a model interface, which represents a model applied to the tasks.

Model Interface

seb.Encoder

Bases: Protocol

Interface which all models must implement.

Source code in src/seb/interfaces/model.py
@runtime_checkable
class Encoder(Protocol):
    """
    Interface which all models must implement.
    """

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional["Task"] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> np.ndarray:
        """Returns a list of embeddings for the given sentences.

        Args:
            sentences: List of sentences to encode
            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
                to be used.
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method

        Returns:
            Embeddings for the given documents
        """
        ...
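
Since Encoder is a runtime-checkable protocol, any object with a compatible encode method conforms to it. A toy sketch (the random embeddings are placeholders for a real model):

from typing import Any, Optional

import numpy as np

import seb

class MyEncoder:
    """A toy encoder returning random vectors; replace with a real embedding model."""

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional[seb.Task] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> np.ndarray:
        return np.random.rand(len(sentences), 384)

# The protocol is runtime checkable, so conformance can be verified with isinstance.
assert isinstance(MyEncoder(), seb.Encoder)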

encode(sentences, *, task=None, batch_size=32, **kwargs)

Returns a list of embeddings for the given sentences.

Parameters:

    sentences (list[str]): List of sentences to encode. Required.
    task (Optional[Task], default None): The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used.
    batch_size (int, default 32): Batch size for the encoding.
    kwargs (Any): Arguments to pass to the model's encode method.

Returns:

    ndarray: Embeddings for the given documents.

Source code in src/seb/interfaces/model.py
def encode(
    self,
    sentences: list[str],
    *,
    task: Optional["Task"] = None,
    batch_size: int = 32,
    **kwargs: Any,
) -> np.ndarray:
    """Returns a list of embeddings for the given sentences.

    Args:
        sentences: List of sentences to encode
        task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
            to be used.
        batch_size: Batch size for the encoding
        kwargs: arguments to pass to the models encode method

    Returns:
        Embeddings for the given documents
    """
    ...

seb.LazyLoadEncoder dataclass

Bases: Encoder

Encoder object, which lazy loads the model on the first call to encode()

Source code in src/seb/interfaces/model.py
@dataclass
class LazyLoadEncoder(Encoder):
    """Encoder object, which lazy loads the model on the first call to encode()"""

    loader: Callable[[], Encoder]
    _model: Optional[Encoder] = None

    def load_model(self):
        """
        Load the model.
        """
        if self._model is None:
            self._model = self.loader()

    def to(self, device: torch.device):
        self.load_model()
        try:
            self._model = self._model.to(device)  # type: ignore
        except AttributeError:
            logging.debug(f"Model {self._model} does not have a to method")

    @property
    def model(self) -> Encoder:
        """
        Dynamically load the model.
        """
        self.load_model()
        return self._model  # type: ignore

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional["Task"] = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """
        Returns a list of embeddings for the given sentences.
        Args:
            sentences: List of sentences to encode
            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
                to be used.
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method

        Returns:
            Embeddings for the given documents
        """
        return self.model.encode(sentences, task=task, **kwargs)

    def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_queries(queries, **kwargs)  # type: ignore
        except AttributeError:
            return self.encode(queries, **kwargs)

    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_corpus(corpus, **kwargs)  # type: ignore
        except AttributeError:
            sep = " "
            if isinstance(corpus, dict):
                sentences = [
                    (corpus["title"][i] + sep + corpus["text"][i]).strip() if "title" in corpus else corpus["text"][i].strip()  # type: ignore
                    for i in range(len(corpus["text"]))  # type: ignore
                ]
            else:
                sentences = [(doc["title"] + sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus]
            return self.encode(sentences, **kwargs)
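
A sketch of lazy loading: the loader callable is only invoked on first use (the dummy encoder below is illustrative):

from typing import Any, Optional

import numpy as np

import seb

class DummyEncoder:
    # A placeholder encoder returning random vectors.
    def encode(self, sentences: list[str], *, task: Optional[seb.Task] = None, **kwargs: Any) -> np.ndarray:
        return np.random.rand(len(sentences), 8)

lazy = seb.LazyLoadEncoder(loader=DummyEncoder)
print(lazy._model is None)                # True: nothing has been loaded yet
print(lazy.encode(["a sentence"]).shape)  # (1, 8): the loader is invoked here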

model: Encoder property

Dynamically load the model.

encode(sentences, *, task=None, **kwargs)

Returns a list of embeddings for the given sentences.

Parameters:

    sentences (list[str]): List of sentences to encode. Required.
    task (Optional[Task], default None): The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used.
    kwargs (Any): Arguments to pass to the model's encode method.

Returns:

    ndarray: Embeddings for the given documents.

Source code in src/seb/interfaces/model.py
def encode(
    self,
    sentences: list[str],
    *,
    task: Optional["Task"] = None,
    **kwargs: Any,
) -> np.ndarray:
    """
    Returns a list of embeddings for the given sentences.
    Args:
        sentences: List of sentences to encode
        task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
            to be used.
        batch_size: Batch size for the encoding
        kwargs: arguments to pass to the models encode method

    Returns:
        Embeddings for the given documents
    """
    return self.model.encode(sentences, task=task, **kwargs)

load_model()

Load the model.

Source code in src/seb/interfaces/model.py
def load_model(self):
    """
    Load the model.
    """
    if self._model is None:
        self._model = self.loader()

seb.SebModel dataclass

An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit) and includes metadata pertaining to the specific model.

Source code in src/seb/interfaces/model.py
@dataclass
class SebModel:
    """
    An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit)
    and includes metadata pertaining to the specific model.
    """

    meta: ModelMeta
    encoder: Encoder

    @property
    def number_of_parameters(self) -> Optional[int]:
        """
        Returns the number of parameters in the model.
        """
        if hasattr(self.encoder, "num_parameters"):
            return sum(p.numel() for p in self.model.parameters() if p.requires_grad)  # type: ignore
        return None

number_of_parameters: Optional[int] property

Returns the number of parameters in the model.

Task Interface

seb.Task

Bases: Protocol

A task is a specific evaluation task for a sentence embedding model.

Attributes:

    name (str): The name of the task.
    main_score (str): The main score of the task.
    reference (str): A reference to the task.
    version (str): The version of the task.
    languages (list[Language]): The languages of the task.
    domain (list[Domain]): The domains of the task. Should be one of the categories listed on https://universaldependencies.org
    task_type (TaskType): The type of the task, which determines how it is evaluated, e.g. Classification.
    task_subtypes (list[str]): A list of subtypes, e.g. Sentiment Classification.
    description (str): A description of the task.

Source code in src/seb/interfaces/task.py
@runtime_checkable
class Task(Protocol):
    """
    A task is a specific evaluation task for a sentence embedding model.

    Attributes:
        name: The name of the task.
        main_score: The main score of the task.
        reference: A reference to the task.
        version: The version of the task.
        languages: The languages of the task.
        domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
        task_type: A list of task types, determines how the task is being evaluated. E.g. Classification.
        task_subtypes: a list of subtypes e.g. Sentiment Classification.
        description: A description of the task.
    """

    name: str
    main_score: str
    reference: str
    version: str
    languages: list[Language]
    domain: list[Domain]
    task_type: TaskType
    task_subtypes: list[str]
    description: str

    def evaluate(self, model: Encoder) -> TaskResult:
        """
        Evaluates a Sentence Embedding Model on the task.

        Args:
            model: A model with the encode method implemented.

        Returns:
            A TaskResult object.
        """
        ...

    def get_documents(self) -> list[str]:
        """
        Get the documents for the task.

        Returns:
            A list of strings.
        """
        ...

    def get_descriptive_stats(self) -> DescriptiveDatasetStats:
        texts = self.get_documents()
        document_lengths = np.array([len(text) for text in texts])

        mean = float(np.mean(document_lengths))
        std = float(np.std(document_lengths))
        return DescriptiveDatasetStats(
            mean_document_length=mean,
            std_document_length=std,
            num_documents=len(document_lengths),
        )

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.name.replace("/", "__").replace(" ", "_")
        return name
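
A sketch of running a single task directly (the task and model names are illustrative):

import seb

task = seb.get_task("DKHate")           # illustrative task name
model = seb.get_model("model-name")     # illustrative model name

result = task.evaluate(model.encoder)   # returns a TaskResult
print(result.get_main_score())
print(task.get_descriptive_stats())     # document-length statistics for the task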

evaluate(model)

Evaluates a Sentence Embedding Model on the task.

Parameters:

    model (Encoder): A model with the encode method implemented. Required.

Returns:

    TaskResult: A TaskResult object.

Source code in src/seb/interfaces/task.py
def evaluate(self, model: Encoder) -> TaskResult:
    """
    Evaluates a Sentence Embedding Model on the task.

    Args:
        model: A model with the encode method implemented.

    Returns:
        A TaskResult object.
    """
    ...

get_documents()

Get the documents for the task.

Returns:

    list[str]: A list of strings.

Source code in src/seb/interfaces/task.py
def get_documents(self) -> list[str]:
    """
    Get the documents for the task.

    Returns:
        A list of strings.
    """
    ...

name_to_path()

Convert a name to a path.

Source code in src/seb/interfaces/task.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.name.replace("/", "__").replace(" ", "_")
    return name

Data Classes

SEB uses data classes to store the results of a benchmark. The following classes are available:

seb.BenchmarkResults

Bases: BaseModel

Dataclass for storing benchmark results.

Attributes:

    meta (ModelMeta): ModelMeta object.
    task_results (list[Union[TaskResult, TaskError]]): List of TaskResult or TaskError objects.

Source code in src/seb/result_dataclasses.py
class BenchmarkResults(BaseModel):
    """
    Dataclass for storing benchmark results.

    Attributes:
        meta: ModelMeta object.
        task_results: List of TaskResult objects.
    """

    meta: ModelMeta
    task_results: list[Union[TaskResult, TaskError]]

    def get_main_score(self, lang: Optional[Iterable[Language]] = None) -> float:
        scores = [t.get_main_score(lang) for t in self.task_results]
        if scores:
            return sum(scores) / len(scores)
        return np.nan

    def __iter__(self) -> Iterator[Union[TaskResult, TaskError]]:  # type: ignore
        return iter(self.task_results)

    def __getitem__(self, index: int) -> Union[TaskResult, TaskError]:
        return self.task_results[index]

    def __len__(self) -> int:
        return len(self.task_results)

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        if path.is_file():
            raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
        path.mkdir(parents=True, exist_ok=True)
        for task_result in self.task_results:
            if isinstance(task_result, TaskResult):
                task_result.to_disk(path / f"{task_result.task_name}.json")
            else:
                task_result.to_disk(path / f"{task_result.task_name}.error.json")

        meta_path = path / "meta.json"
        self.meta.to_disk(meta_path)

    @classmethod
    def from_disk(cls, path: Path) -> "BenchmarkResults":
        """
        Load task results from a path.
        """
        if not path.is_dir():
            raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
        task_results = []
        for file in path.glob("*.json"):
            if file.stem == "meta":
                continue
            if file.stem.endswith(".error"):
                task_results.append(TaskError.from_disk(file))
            else:
                task_results.append(TaskResult.from_disk(file))

        meta_path = path / "meta.json"
        meta = ModelMeta.from_disk(meta_path)
        return cls(meta=meta, task_results=task_results)
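
A sketch of reloading saved results and iterating over the per-task entries (the path is illustrative):

from pathlib import Path

import seb

results = seb.BenchmarkResults.from_disk(Path("results/my-model"))
for task_result in results:
    if isinstance(task_result, seb.TaskResult):
        print(task_result.task_name, task_result.get_main_score())
    else:  # a seb.TaskError was stored for tasks that failed
        print(task_result.task_name, "failed:", task_result.error)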

from_disk(path) classmethod

Load task results from a path.

Source code in src/seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "BenchmarkResults":
    """
    Load task results from a path.
    """
    if not path.is_dir():
        raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
    task_results = []
    for file in path.glob("*.json"):
        if file.stem == "meta":
            continue
        if file.stem.endswith(".error"):
            task_results.append(TaskError.from_disk(file))
        else:
            task_results.append(TaskResult.from_disk(file))

    meta_path = path / "meta.json"
    meta = ModelMeta.from_disk(meta_path)
    return cls(meta=meta, task_results=task_results)

to_disk(path)

Write task results to a path.

Source code in src/seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    if path.is_file():
        raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
    path.mkdir(parents=True, exist_ok=True)
    for task_result in self.task_results:
        if isinstance(task_result, TaskResult):
            task_result.to_disk(path / f"{task_result.task_name}.json")
        else:
            task_result.to_disk(path / f"{task_result.task_name}.error.json")

    meta_path = path / "meta.json"
    self.meta.to_disk(meta_path)

seb.TaskResult

Bases: BaseModel

Dataclass for storing task results.

Attributes:

    task_name (str): Name of the task.
    task_description (str): Description of the task.
    task_version (str): Version of the task.
    time_of_run (datetime): Time of the run.
    scores (dict[Language, dict[str, Union[float, str]]]): Dictionary of scores of the form {language: {"metric": value}}.
    main_score (str): Name of the main score.

Source code in src/seb/result_dataclasses.py
class TaskResult(BaseModel):
    """
    Dataclass for storing task results.

    Attributes:
        task_name: Name of the task.
        task_description: Description of the task.
        task_version: Version of the task.
        time_of_run: Time of the run.
        scores: Dictionary of scores on the form {language: {"metric": value}}.
        main_score: Name of the main score.
    """

    task_name: str
    task_description: str
    task_version: str
    time_of_run: datetime
    scores: dict[Language, dict[str, Union[float, str]]]  # {language: {"metric": value}}.
    main_score: str

    def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
        """
        Returns the main score for a given set of languages.

        Args:
            lang: List of languages to get the main score for.

        Returns:
            The main score.
        """
        main_scores = []
        if lang is None:
            lang = self.scores.keys()

        for l in lang:
            main_scores.append(self.scores[l][self.main_score])  # type: ignore

        return sum(main_scores) / len(main_scores)

    @property
    def languages(self) -> list[Language]:
        """
        Returns the languages of the task.
        """
        return list(self.scores.keys())

    @classmethod
    def from_disk(cls, path: Path) -> "TaskResult":
        """
        Load task results from a path.
        """
        with path.open("r") as f:
            task_results = json.load(f)
        return cls(**task_results)

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        json_str: str = self.model_dump_json()  # type: ignore

        with path.open("w") as f:
            f.write(json_str)

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.task_name.replace("/", "__").replace(" ", "_")
        return name
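
A sketch of loading a single stored task result and computing the main score for a subset of languages (the path and language code are illustrative):

from pathlib import Path

import seb

task_result = seb.TaskResult.from_disk(Path("results/my-model/DKHate.json"))
print(task_result.languages)               # languages covered by the scores
print(task_result.get_main_score(["da"]))  # main score restricted to one language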

languages: list[Language] property

Returns the languages of the task.

from_disk(path) classmethod

Load task results from a path.

Source code in src/seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskResult":
    """
    Load task results from a path.
    """
    with path.open("r") as f:
        task_results = json.load(f)
    return cls(**task_results)

get_main_score(lang=None)

Returns the main score for a given set of languages.

Parameters:

    lang (Optional[Iterable[str]], default None): List of languages to get the main score for.

Returns:

    float: The main score.

Source code in src/seb/result_dataclasses.py
def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
    """
    Returns the main score for a given set of languages.

    Args:
        lang: List of languages to get the main score for.

    Returns:
        The main score.
    """
    main_scores = []
    if lang is None:
        lang = self.scores.keys()

    for l in lang:
        main_scores.append(self.scores[l][self.main_score])  # type: ignore

    return sum(main_scores) / len(main_scores)

name_to_path()

Convert a name to a path.

Source code in src/seb/result_dataclasses.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.task_name.replace("/", "__").replace(" ", "_")
    return name

to_disk(path)

Write task results to a path.

Source code in src/seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    json_str: str = self.model_dump_json()  # type: ignore

    with path.open("w") as f:
        f.write(json_str)

seb.TaskError

Bases: BaseModel

Source code in src/seb/result_dataclasses.py
class TaskError(BaseModel):
    task_name: str
    error: str
    time_of_run: datetime
    languages: list[str] = []

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        json_str: str = self.model_dump_json()  # type: ignore

        with path.open("w") as f:
            f.write(json_str)

    @classmethod
    def from_disk(cls, path: Path) -> "TaskError":
        """
        Load task results from a path.
        """
        with path.open() as f:
            task_results = json.load(f)
        return cls(**task_results)

    @staticmethod
    def get_main_score(lang: Optional[Iterable[str]] = None) -> float:  # noqa: ARG004
        return np.nan

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.task_name.replace("/", "__").replace(" ", "_")
        return name

from_disk(path) classmethod

Load task results from a path.

Source code in src/seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskError":
    """
    Load task results from a path.
    """
    with path.open() as f:
        task_results = json.load(f)
    return cls(**task_results)

name_to_path()

Convert a name to a path.

Source code in src/seb/result_dataclasses.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.task_name.replace("/", "__").replace(" ", "_")
    return name

to_disk(path)

Write task results to a path.

Source code in src/seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    json_str: str = self.model_dump_json()  # type: ignore

    with path.open("w") as f:
        f.write(json_str)