API

General

General functions for working with the tasks and models implemented in SEB.

seb.get_task(name)

Fetches a task by name.

Parameters:

    name (str): The name of the task. Required.

Returns:

    Task: A task.

Source code in src/seb/registries.py
def get_task(name: str) -> Task:
    """
    Fetches a task by name.

    Args:
        name: The name of the task.

    Returns:
        A task.
    """
    return tasks.get(name)()
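
A minimal usage sketch (the task name below is illustrative; use seb.get_all_tasks() to see the registered names):

import seb

# Fetch a single registered task by name (the name here is illustrative).
task = seb.get_task("DKHate")
print(task.name, task.main_score)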

seb.get_all_tasks()

Returns all tasks implemented in SEB.

Returns:

    list[Task]: A list of all tasks in SEB.

Source code in src/seb/registries.py
def get_all_tasks() -> list[Task]:
    """
    Returns all tasks implemented in SEB.

    Returns:
        A list of all tasks in SEB.
    """
    return [get_task(task_name) for task_name in tasks.get_all()]
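
For example, to list every registered task and its main metric:

import seb

# Enumerate all registered tasks and print their names and main metrics.
for task in seb.get_all_tasks():
    print(f"{task.name}: {task.main_score}")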

seb.get_model(name)

Fetches a model by name.

Parameters:

    name (str): The name of the model. Required.

Returns:

    SebModel: A model including metadata.

Source code in src/seb/registries.py
def get_model(name: str) -> SebModel:
    """
    Fetches a model by name.

    Args:
        name: The name of the model.

    Returns:
        A model including metadata.
    """
    return models.get(name)()
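
A minimal sketch (the model name is illustrative; use seb.get_all_models() to see the registered names):

import seb

# Fetch a registered model by name (the name here is illustrative).
model = seb.get_model("sentence-transformers/all-MiniLM-L6-v2")
print(model.meta.name)
# The underlying encoder is loaded lazily, so fetching the model is cheap.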

seb.get_all_models()

Get all the models implemented in SEB.

Returns:

    list[SebModel]: A list of all models in SEB.

Source code in src/seb/registries.py
def get_all_models() -> list[SebModel]:
    """
    Get all the models implemented in SEB.

    Returns:
        A list of all models in SEB.
    """
    return [get_model(model_name) for model_name in models.get_all()]
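
For example, to list the names of all registered models:

import seb

# Print the name of every model registered in SEB.
for model in seb.get_all_models():
    print(model.meta.name)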

Benchmark

seb.Benchmark

Benchmark is the main orchestrator of the SEB benchmark.

Source code in src/seb/benchmark.py
class Benchmark:
    """
    Benchmark is the main orchestrator of the SEB benchmark.
    """

    def __init__(
        self,
        languages: Optional[list[str]] = None,
        tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
    ) -> None:
        """
        Initialize the benchmark.

        Args:
            languages: A list of languages to run the benchmark on. If None, all languages are used.
            tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
        """
        self.languages = languages

        self.tasks = self.get_tasks(tasks, languages)
        self.task_names = [task.name for task in self.tasks]

    @staticmethod
    def get_tasks(
        tasks: Optional[Union[Iterable[str], Iterable[Task]]],
        languages: Optional[list[str]],
    ) -> list[Task]:
        """
        Get the tasks for the benchmark.

        Returns:
            A list of tasks.
        """
        _tasks = []

        if tasks is None:
            _tasks = get_all_tasks()
        else:
            for task in tasks:
                if isinstance(task, str):
                    _tasks.append(get_task(task))
                elif isinstance(task, Task):
                    _tasks.append(task)
                else:
                    raise ValueError(f"Invalid task type: {type(task)}")

        if languages is not None:
            langs = set(languages)
            _tasks = [task for task in _tasks if set(task.languages) & langs]

        return _tasks

    def evaluate_model(
        self,
        model: SebModel,
        use_cache: bool = True,
        run_model: bool = True,
        raise_errors: bool = True,
        cache_dir: Optional[Path] = None,
        verbose: bool = True,
    ) -> BenchmarkResults:
        """
        Evaluate a model on the benchmark.

        Args:
            model: The model to evaluate.
            use_cache: Whether to use the cache.
            run_model: Whether to run the model if the cache is not present.
            raise_errors: Whether to raise errors.
            cache_dir: The cache directory to use. If None, the default cache directory is used.
            verbose: Whether to show a progress bar.

        Returns:
            The results of the benchmark.
        """
        task_results = []
        pbar = tqdm(
            self.tasks,
            position=1,
            desc=f"Running {model.meta.name}",
            leave=False,
            disable=not verbose,
        )
        for task in pbar:
            pbar.set_description(f"Running {model.meta.name} on {task.name}")
            task_result = run_task(
                task,
                model,
                use_cache=use_cache,
                run_model=run_model,
                raise_errors=raise_errors,
                cache_dir=cache_dir,
            )
            task_results.append(task_result)

        return BenchmarkResults(meta=model.meta, task_results=task_results)

    def evaluate_models(
        self,
        models: list[SebModel],
        use_cache: bool = True,
        run_model: bool = True,
        raise_errors: bool = True,
        cache_dir: Optional[Path] = None,
        verbose: bool = True,
    ) -> list[BenchmarkResults]:
        """
        Evaluate a list of models on the benchmark.

        Args:
            models: The models to evaluate.
            use_cache: Whether to use the cache.
            run_model: Whether to run the model if the cache is not present.
            raise_errors: Whether to raise errors.
            cache_dir: The cache directory to use. If None, the default cache directory is used.
            verbose: Whether to show a progress bar.

        Returns:
            The results of the benchmark, once for each model.
        """
        results = []
        pbar = tqdm(
            models,
            position=0,
            desc="Running Benchmark",
            leave=True,
            disable=not verbose,
        )

        for model in pbar:
            pbar.set_description(f"Running {model.meta.name}")
            results.append(
                self.evaluate_model(
                    model,
                    use_cache=use_cache,
                    run_model=run_model,
                    raise_errors=raise_errors,
                    cache_dir=cache_dir,
                    verbose=verbose,
                ),
            )
        return results
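
A minimal end-to-end sketch (the language code and model name are illustrative):

import seb

# Restrict the benchmark to a single language and evaluate one registered model.
benchmark = seb.Benchmark(languages=["da"])
model = seb.get_model("sentence-transformers/all-MiniLM-L6-v2")
results = benchmark.evaluate_model(model)

# Average of the main scores across the tasks that were run.
print(results.get_main_score())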

__init__(languages=None, tasks=None)

Initialize the benchmark.

Parameters:

    languages (Optional[list[str]], default None): A list of languages to run the benchmark on. If None, all languages are used.
    tasks (Optional[Union[Iterable[str], Iterable[Task]]], default None): The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
Source code in src/seb/benchmark.py
def __init__(
    self,
    languages: Optional[list[str]] = None,
    tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
) -> None:
    """
    Initialize the benchmark.

    Args:
        languages: A list of languages to run the benchmark on. If None, all languages are used.
        tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
    """
    self.languages = languages

    self.tasks = self.get_tasks(tasks, languages)
    self.task_names = [task.name for task in self.tasks]

evaluate_model(model, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)

Evaluate a model on the benchmark.

Parameters:

    model (SebModel): The model to evaluate. Required.
    use_cache (bool, default True): Whether to use the cache.
    run_model (bool, default True): Whether to run the model if the cache is not present.
    raise_errors (bool, default True): Whether to raise errors.
    cache_dir (Optional[Path], default None): The cache directory to use. If None, the default cache directory is used.
    verbose (bool, default True): Whether to show a progress bar.

Returns:

    BenchmarkResults: The results of the benchmark.

Source code in src/seb/benchmark.py
def evaluate_model(
    self,
    model: SebModel,
    use_cache: bool = True,
    run_model: bool = True,
    raise_errors: bool = True,
    cache_dir: Optional[Path] = None,
    verbose: bool = True,
) -> BenchmarkResults:
    """
    Evaluate a model on the benchmark.

    Args:
        model: The model to evaluate.
        use_cache: Whether to use the cache.
        run_model: Whether to run the model if the cache is not present.
        raise_errors: Whether to raise errors.
        cache_dir: The cache directory to use. If None, the default cache directory is used.
        verbose: Whether to show a progress bar.

    Returns:
        The results of the benchmark.
    """
    task_results = []
    pbar = tqdm(
        self.tasks,
        position=1,
        desc=f"Running {model.meta.name}",
        leave=False,
        disable=not verbose,
    )
    for task in pbar:
        pbar.set_description(f"Running {model.meta.name} on {task.name}")
        task_result = run_task(
            task,
            model,
            use_cache=use_cache,
            run_model=run_model,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
        )
        task_results.append(task_result)

    return BenchmarkResults(meta=model.meta, task_results=task_results)

evaluate_models(models, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)

Evaluate a list of models on the benchmark.

Parameters:

    models (list[SebModel]): The models to evaluate. Required.
    use_cache (bool, default True): Whether to use the cache.
    run_model (bool, default True): Whether to run the model if the cache is not present.
    raise_errors (bool, default True): Whether to raise errors.
    cache_dir (Optional[Path], default None): The cache directory to use. If None, the default cache directory is used.
    verbose (bool, default True): Whether to show a progress bar.

Returns:

    list[BenchmarkResults]: The results of the benchmark, one set of results per model.

Source code in src/seb/benchmark.py
def evaluate_models(
    self,
    models: list[SebModel],
    use_cache: bool = True,
    run_model: bool = True,
    raise_errors: bool = True,
    cache_dir: Optional[Path] = None,
    verbose: bool = True,
) -> list[BenchmarkResults]:
    """
    Evaluate a list of models on the benchmark.

    Args:
        models: The models to evaluate.
        use_cache: Whether to use the cache.
        run_model: Whether to run the model if the cache is not present.
        raise_errors: Whether to raise errors.
        cache_dir: The cache directory to use. If None, the default cache directory is used.
        verbose: Whether to show a progress bar.

    Returns:
        The results of the benchmark, once for each model.
    """
    results = []
    pbar = tqdm(
        models,
        position=0,
        desc="Running Benchmark",
        leave=True,
        disable=not verbose,
    )

    for model in pbar:
        pbar.set_description(f"Running {model.meta.name}")
        results.append(
            self.evaluate_model(
                model,
                use_cache=use_cache,
                run_model=run_model,
                raise_errors=raise_errors,
                cache_dir=cache_dir,
                verbose=verbose,
            ),
        )
    return results
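
A sketch of evaluating several models and writing each result set to disk (the model names and output directory are illustrative):

from pathlib import Path

import seb

benchmark = seb.Benchmark()
models = [seb.get_model(name) for name in ("model-a", "model-b")]  # illustrative names

for result in benchmark.evaluate_models(models):
    # Each BenchmarkResults is written to its own directory of JSON files.
    result.to_disk(Path("results") / result.meta.name)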

get_tasks(tasks, languages) staticmethod

Get the tasks for the benchmark.

Returns:

    list[Task]: A list of tasks.

Source code in src/seb/benchmark.py
@staticmethod
def get_tasks(
    tasks: Optional[Union[Iterable[str], Iterable[Task]]],
    languages: Optional[list[str]],
) -> list[Task]:
    """
    Get the tasks for the benchmark.

    Returns:
        A list of tasks.
    """
    _tasks = []

    if tasks is None:
        _tasks = get_all_tasks()
    else:
        for task in tasks:
            if isinstance(task, str):
                _tasks.append(get_task(task))
            elif isinstance(task, Task):
                _tasks.append(task)
            else:
                raise ValueError(f"Invalid task type: {type(task)}")

    if languages is not None:
        langs = set(languages)
        _tasks = [task for task in _tasks if set(task.languages) & langs]

    return _tasks

Interfaces

SEB implements two main interfaces: a task interface, which represents a task within the benchmark, and a model interface, which represents a model applied to the tasks.

Model Interface

seb.Encoder

Bases: Protocol

Interface which all models must implement.

Source code in src/seb/interfaces/model.py
@runtime_checkable
class Encoder(Protocol):
    """
    Interface which all models must implement.
    """

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional["Task"] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> np.ndarray:
        """Returns a list of embeddings for the given sentences.

        Args:
            sentences: List of sentences to encode
            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
                to be used.
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method

        Returns:
            Embeddings for the given documents
        """
        ...
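
Since Encoder is a runtime-checkable protocol, any object with a compatible encode method conforms to it. A toy sketch (the random embeddings are placeholders for a real model):

from typing import Any, Optional

import numpy as np

import seb

class MyEncoder:
    """A toy encoder returning random vectors; replace with a real embedding model."""

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional[seb.Task] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> np.ndarray:
        return np.random.rand(len(sentences), 384)

# The protocol is runtime checkable, so conformance can be verified with isinstance.
assert isinstance(MyEncoder(), seb.Encoder)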

encode(sentences, *, task=None, batch_size=32, **kwargs)

Returns a list of embeddings for the given sentences.

Parameters:

    sentences (list[str]): List of sentences to encode. Required.
    task (Optional[Task], default None): The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used.
    batch_size (int, default 32): Batch size for the encoding.
    kwargs (Any): Arguments to pass to the model's encode method.

Returns:

    ndarray: Embeddings for the given documents.

Source code in src/seb/interfaces/model.py
def encode(
    self,
    sentences: list[str],
    *,
    task: Optional["Task"] = None,
    batch_size: int = 32,
    **kwargs: Any,
) -> np.ndarray:
    """Returns a list of embeddings for the given sentences.

    Args:
        sentences: List of sentences to encode
        task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
            to be used.
        batch_size: Batch size for the encoding
        kwargs: arguments to pass to the models encode method

    Returns:
        Embeddings for the given documents
    """
    ...

seb.LazyLoadEncoder dataclass

Bases: Encoder

Encoder object, which lazy loads the model on the first call to encode()

Source code in src/seb/interfaces/model.py
@dataclass
class LazyLoadEncoder(Encoder):
    """Encoder object, which lazy loads the model on the first call to encode()"""

    loader: Callable[[], Encoder]
    _model: Optional[Encoder] = None

    def load_model(self):
        """
        Load the model.
        """
        if self._model is None:
            self._model = self.loader()

    def to(self, device: torch.device):
        self.load_model()
        try:
            self._model = self._model.to(device)  # type: ignore
        except AttributeError:
            logging.debug(f"Model {self._model} does not have a to method")

    @property
    def model(self) -> Encoder:
        """
        Dynamically load the model.
        """
        self.load_model()
        return self._model  # type: ignore

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional["Task"] = None,
        **kwargs: Any,
    ) -> np.ndarray:
        """
        Returns a list of embeddings for the given sentences.
        Args:
            sentences: List of sentences to encode
            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
                to be used.
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method

        Returns:
            Embeddings for the given documents
        """
        return self.model.encode(sentences, task=task, **kwargs)

    def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_queries(queries, **kwargs)  # type: ignore
        except AttributeError:
            return self.encode(queries, **kwargs)

    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_corpus(corpus, **kwargs)  # type: ignore
        except AttributeError:
            sep = " "
            if isinstance(corpus, dict):
                sentences = [
                    (corpus["title"][i] + sep + corpus["text"][i]).strip() if "title" in corpus else corpus["text"][i].strip()  # type: ignore
                    for i in range(len(corpus["text"]))  # type: ignore
                ]
            else:
                sentences = [(doc["title"] + sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus]
            return self.encode(sentences, **kwargs)
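
A sketch of lazy loading: the loader callable is only invoked on first use (the dummy encoder below is illustrative):

from typing import Any, Optional

import numpy as np

import seb

class DummyEncoder:
    # A placeholder encoder returning random vectors.
    def encode(self, sentences: list[str], *, task: Optional[seb.Task] = None, **kwargs: Any) -> np.ndarray:
        return np.random.rand(len(sentences), 8)

lazy = seb.LazyLoadEncoder(loader=DummyEncoder)
print(lazy._model is None)                # True: nothing has been loaded yet
print(lazy.encode(["a sentence"]).shape)  # (1, 8): the loader is invoked here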

model: Encoder property

Dynamically load the model.

encode(sentences, *, task=None, **kwargs)

Returns a list of embeddings for the given sentences.

Parameters:

    sentences (list[str]): List of sentences to encode. Required.
    task (Optional[Task], default None): The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used.
    kwargs (Any): Arguments to pass to the model's encode method.

Returns:

    ndarray: Embeddings for the given documents.

Source code in src/seb/interfaces/model.py
def encode(
    self,
    sentences: list[str],
    *,
    task: Optional["Task"] = None,
    **kwargs: Any,
) -> np.ndarray:
    """
    Returns a list of embeddings for the given sentences.
    Args:
        sentences: List of sentences to encode
        task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
            to be used.
        batch_size: Batch size for the encoding
        kwargs: arguments to pass to the models encode method

    Returns:
        Embeddings for the given documents
    """
    return self.model.encode(sentences, task=task, **kwargs)

load_model()

Load the model.

Source code in src/seb/interfaces/model.py
def load_model(self):
    """
    Load the model.
    """
    if self._model is None:
        self._model = self.loader()

seb.SebModel dataclass

An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit) and includes metadata pertaining to the specific model.

Source code in src/seb/interfaces/model.py
@dataclass
class SebModel:
    """
    An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit)
    and includes metadata pertaining to the specific model.
    """

    meta: ModelMeta
    encoder: Encoder

    @property
    def number_of_parameters(self) -> Optional[int]:
        """
        Returns the number of parameters in the model.
        """
        if hasattr(self.encoder, "num_parameters"):
            return sum(p.numel() for p in self.model.parameters() if p.requires_grad)  # type: ignore
        return None

number_of_parameters: Optional[int] property

Returns the number of parameters in the model.

Task Interface

seb.Task

Bases: Protocol

A task is a specific evaluation task for a sentence embedding model.

Attributes:

    name (str): The name of the task.
    main_score (str): The main score of the task.
    reference (str): A reference to the task.
    version (str): The version of the task.
    languages (list[Language]): The languages of the task.
    domain (list[Domain]): The domains of the task. Should be one of the categories listed on https://universaldependencies.org
    task_type (TaskType): The type of the task, which determines how it is evaluated, e.g. Classification.
    task_subtypes (list[str]): A list of subtypes, e.g. Sentiment Classification.
    description (str): A description of the task.

Source code in src/seb/interfaces/task.py
@runtime_checkable
class Task(Protocol):
    """
    A task is a specific evaluation task for a sentence embedding model.

    Attributes:
        name: The name of the task.
        main_score: The main score of the task.
        reference: A reference to the task.
        version: The version of the task.
        languages: The languages of the task.
        domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
        task_type: A list of task types, determines how the task is being evaluated. E.g. Classification.
        task_subtypes: a list of subtypes e.g. Sentiment Classification.
        description: A description of the task.
    """

    name: str
    main_score: str
    reference: str
    version: str
    languages: list[Language]
    domain: list[Domain]
    task_type: TaskType
    task_subtypes: list[str]
    description: str

    def evaluate(self, model: Encoder) -> TaskResult:
        """
        Evaluates a Sentence Embedding Model on the task.

        Args:
            model: A model with the encode method implemented.

        Returns:
            A TaskResult object.
        """
        ...

    def get_documents(self) -> list[str]:
        """
        Get the documents for the task.

        Returns:
            A list of strings.
        """
        ...

    def get_descriptive_stats(self) -> DescriptiveDatasetStats:
        texts = self.get_documents()
        document_lengths = np.array([len(text) for text in texts])

        mean = float(np.mean(document_lengths))
        std = float(np.std(document_lengths))
        return DescriptiveDatasetStats(
            mean_document_length=mean,
            std_document_length=std,
            num_documents=len(document_lengths),
        )

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.name.replace("/", "__").replace(" ", "_")
        return name
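
A sketch of running a single task directly (the task and model names are illustrative):

import seb

task = seb.get_task("DKHate")           # illustrative task name
model = seb.get_model("model-name")     # illustrative model name

result = task.evaluate(model.encoder)   # returns a TaskResult
print(result.get_main_score())
print(task.get_descriptive_stats())     # document-length statistics for the task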

evaluate(model)

Evaluates a Sentence Embedding Model on the task.

Parameters:

    model (Encoder): A model with the encode method implemented. Required.

Returns:

    TaskResult: A TaskResult object.

Source code in src/seb/interfaces/task.py
def evaluate(self, model: Encoder) -> TaskResult:
    """
    Evaluates a Sentence Embedding Model on the task.

    Args:
        model: A model with the encode method implemented.

    Returns:
        A TaskResult object.
    """
    ...

get_documents()

Get the documents for the task.

Returns:

    list[str]: A list of strings.

Source code in src/seb/interfaces/task.py
def get_documents(self) -> list[str]:
    """
    Get the documents for the task.

    Returns:
        A list of strings.
    """
    ...

name_to_path()

Convert a name to a path.

Source code in src/seb/interfaces/task.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.name.replace("/", "__").replace(" ", "_")
    return name

Data Classes

SEB uses data classes to store the results of a benchmark. The following classes are available:

seb.BenchmarkResults

Bases: BaseModel

Dataclass for storing benchmark results.

Attributes:

    meta (ModelMeta): ModelMeta object.
    task_results (list[Union[TaskResult, TaskError]]): List of TaskResult or TaskError objects.

Source code in src/seb/result_dataclasses.py
class BenchmarkResults(BaseModel):
    """
    Dataclass for storing benchmark results.

    Attributes:
        meta: ModelMeta object.
        task_results: List of TaskResult objects.
    """

    meta: ModelMeta
    task_results: list[Union[TaskResult, TaskError]]

    def get_main_score(self, lang: Optional[Iterable[Language]] = None) -> float:
        scores = [t.get_main_score(lang) for t in self.task_results]
        if scores:
            return sum(scores) / len(scores)
        return np.nan

    def __iter__(self) -> Iterator[Union[TaskResult, TaskError]]:  # type: ignore
        return iter(self.task_results)

    def __getitem__(self, index: int) -> Union[TaskResult, TaskError]:
        return self.task_results[index]

    def __len__(self) -> int:
        return len(self.task_results)

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        if path.is_file():
            raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
        path.mkdir(parents=True, exist_ok=True)
        for task_result in self.task_results:
            if isinstance(task_result, TaskResult):
                task_result.to_disk(path / f"{task_result.task_name}.json")
            else:
                task_result.to_disk(path / f"{task_result.task_name}.error.json")

        meta_path = path / "meta.json"
        self.meta.to_disk(meta_path)

    @classmethod
    def from_disk(cls, path: Path) -> "BenchmarkResults":
        """
        Load task results from a path.
        """
        if not path.is_dir():
            raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
        task_results = []
        for file in path.glob("*.json"):
            if file.stem == "meta":
                continue
            if file.stem.endswith(".error"):
                task_results.append(TaskError.from_disk(file))
            else:
                task_results.append(TaskResult.from_disk(file))

        meta_path = path / "meta.json"
        meta = ModelMeta.from_disk(meta_path)
        return cls(meta=meta, task_results=task_results)
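
A sketch of reloading saved results and iterating over the per-task entries (the path is illustrative):

from pathlib import Path

import seb

results = seb.BenchmarkResults.from_disk(Path("results/my-model"))
for task_result in results:
    if isinstance(task_result, seb.TaskResult):
        print(task_result.task_name, task_result.get_main_score())
    else:  # a seb.TaskError was stored for tasks that failed
        print(task_result.task_name, "failed:", task_result.error)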

from_disk(path) classmethod

Load task results from a path.

Source code in src/seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "BenchmarkResults":
    """
    Load task results from a path.
    """
    if not path.is_dir():
        raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
    task_results = []
    for file in path.glob("*.json"):
        if file.stem == "meta":
            continue
        if file.stem.endswith(".error"):
            task_results.append(TaskError.from_disk(file))
        else:
            task_results.append(TaskResult.from_disk(file))

    meta_path = path / "meta.json"
    meta = ModelMeta.from_disk(meta_path)
    return cls(meta=meta, task_results=task_results)

to_disk(path)

Write task results to a path.

Source code in src/seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    if path.is_file():
        raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
    path.mkdir(parents=True, exist_ok=True)
    for task_result in self.task_results:
        if isinstance(task_result, TaskResult):
            task_result.to_disk(path / f"{task_result.task_name}.json")
        else:
            task_result.to_disk(path / f"{task_result.task_name}.error.json")

    meta_path = path / "meta.json"
    self.meta.to_disk(meta_path)

seb.TaskResult

Bases: BaseModel

Dataclass for storing task results.

Attributes:

    task_name (str): Name of the task.
    task_description (str): Description of the task.
    task_version (str): Version of the task.
    time_of_run (datetime): Time of the run.
    scores (dict[Language, dict[str, Union[float, str]]]): Dictionary of scores of the form {language: {"metric": value}}.
    main_score (str): Name of the main score.

Source code in src/seb/result_dataclasses.py
class TaskResult(BaseModel):
    """
    Dataclass for storing task results.

    Attributes:
        task_name: Name of the task.
        task_description: Description of the task.
        task_version: Version of the task.
        time_of_run: Time of the run.
        scores: Dictionary of scores on the form {language: {"metric": value}}.
        main_score: Name of the main score.
    """

    task_name: str
    task_description: str
    task_version: str
    time_of_run: datetime
    scores: dict[Language, dict[str, Union[float, str]]]  # {language: {"metric": value}}.
    main_score: str

    def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
        """
        Returns the main score for a given set of languages.

        Args:
            lang: List of languages to get the main score for.

        Returns:
            The main score.
        """
        main_scores = []
        if lang is None:
            lang = self.scores.keys()

        for l in lang:
            main_scores.append(self.scores[l][self.main_score])  # type: ignore

        return sum(main_scores) / len(main_scores)

    @property
    def languages(self) -> list[Language]:
        """
        Returns the languages of the task.
        """
        return list(self.scores.keys())

    @classmethod
    def from_disk(cls, path: Path) -> "TaskResult":
        """
        Load task results from a path.
        """
        with path.open("r") as f:
            task_results = json.load(f)
        return cls(**task_results)

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        json_str: str = self.model_dump_json()  # type: ignore

        with path.open("w") as f:
            f.write(json_str)

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.task_name.replace("/", "__").replace(" ", "_")
        return name
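
A sketch of loading a single stored task result and computing the main score for a subset of languages (the path and language code are illustrative):

from pathlib import Path

import seb

task_result = seb.TaskResult.from_disk(Path("results/my-model/DKHate.json"))
print(task_result.languages)               # languages covered by the scores
print(task_result.get_main_score(["da"]))  # main score restricted to one language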

languages: list[Language] property

Returns the languages of the task.

from_disk(path) classmethod

Load task results from a path.

Source code in src/seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskResult":
    """
    Load task results from a path.
    """
    with path.open("r") as f:
        task_results = json.load(f)
    return cls(**task_results)

get_main_score(lang=None)

Returns the main score for a given set of languages.

Parameters:

    lang (Optional[Iterable[str]], default None): List of languages to get the main score for.

Returns:

    float: The main score.

Source code in src/seb/result_dataclasses.py
def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
    """
    Returns the main score for a given set of languages.

    Args:
        lang: List of languages to get the main score for.

    Returns:
        The main score.
    """
    main_scores = []
    if lang is None:
        lang = self.scores.keys()

    for l in lang:
        main_scores.append(self.scores[l][self.main_score])  # type: ignore

    return sum(main_scores) / len(main_scores)

name_to_path()

Convert a name to a path.

Source code in src/seb/result_dataclasses.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.task_name.replace("/", "__").replace(" ", "_")
    return name

to_disk(path)

Write task results to a path.

Source code in src/seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    json_str: str = self.model_dump_json()  # type: ignore

    with path.open("w") as f:
        f.write(json_str)

seb.TaskError

Bases: BaseModel

Source code in src/seb/result_dataclasses.py
class TaskError(BaseModel):
    task_name: str
    error: str
    time_of_run: datetime
    languages: list[str] = []

    def to_disk(self, path: Path) -> None:
        """
        Write task results to a path.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        json_str: str = self.model_dump_json()  # type: ignore

        with path.open("w") as f:
            f.write(json_str)

    @classmethod
    def from_disk(cls, path: Path) -> "TaskError":
        """
        Load task results from a path.
        """
        with path.open() as f:
            task_results = json.load(f)
        return cls(**task_results)

    @staticmethod
    def get_main_score(lang: Optional[Iterable[str]] = None) -> float:  # noqa: ARG004
        return np.nan

    def name_to_path(self) -> str:
        """
        Convert a name to a path.
        """
        name = self.task_name.replace("/", "__").replace(" ", "_")
        return name

from_disk(path) classmethod

Load task results from a path.

Source code in src/seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskError":
    """
    Load task results from a path.
    """
    with path.open() as f:
        task_results = json.load(f)
    return cls(**task_results)

name_to_path()

Convert a name to a path.

Source code in src/seb/result_dataclasses.py
def name_to_path(self) -> str:
    """
    Convert a name to a path.
    """
    name = self.task_name.replace("/", "__").replace(" ", "_")
    return name

to_disk(path)

Write task results to a path.

Source code in src/seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
    """
    Write task results to a path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    json_str: str = self.model_dump_json()  # type: ignore

    with path.open("w") as f:
        f.write(json_str)