API¶
General¶
General functions for fetching the tasks and models implemented in SEB.
seb.registries.get_task(name)
¶
Fetches a task by name.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `name` | `str` | The name of the task. | required |

Returns:

| Type | Description |
|---|---|
| `Task` | A task. |

Source code in seb/registries.py
def get_task(name: str) -> Task:
"""
Fetches a task by name.
Args:
name: The name of the task.
Returns:
A task.
"""
return tasks.get(name)()
seb.registries.get_all_tasks()
¶
Returns all tasks implemented in SEB.
Returns:

| Type | Description |
|---|---|
| `list` | A list of all tasks in SEB. |

Source code in seb/registries.py
def get_all_tasks() -> list[Task]:
"""
Returns all tasks implemented in SEB.
Returns:
A list of all tasks in SEB.
"""
return [get_task(task_name) for task_name in tasks.get_all()]
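A minimal usage sketch of the two registry functions above. No task name is hard-coded; the example picks one from the registry, since the exact set of registered tasks depends on the installed SEB version.

```python
from seb.registries import get_all_tasks, get_task

# List every registered task and print its name.
all_tasks = get_all_tasks()
print([task.name for task in all_tasks])

# Fetch a single task by its registered name.
first_name = all_tasks[0].name
task = get_task(first_name)
print(task.name, task.task_type, task.languages)
```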
seb.registries.get_model(name)
¶
Fetches a model by name.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `name` | `str` | The name of the model. | required |

Returns:

| Type | Description |
|---|---|
| `SebModel` | A model including metadata. |

Source code in seb/registries.py
def get_model(name: str) -> SebModel:
"""
Fetches a model by name.
Args:
name: The name of the model.
Returns:
A model including metadata.
"""
return models.get(name)()
seb.registries.get_all_models()
¶
Get all the models implemented in SEB.
Returns:

| Type | Description |
|---|---|
| `list` | A list of all models in SEB. |

Source code in seb/registries.py
def get_all_models() -> list[SebModel]:
"""
Get all the models implemented in SEB.
Returns:
A list of all models in SEB.
"""
return [get_model(model_name) for model_name in models.get_all()]
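A similar sketch for the model registry. Model names are taken from the registry itself rather than hard-coded; since SebModel is documented as loading models dynamically, listing models should not load any model weights.

```python
from seb.registries import get_all_models, get_model

# Inspect registered models; metadata is available without loading weights.
models = get_all_models()
print([model.meta.name for model in models])

# Fetch one model by its registered name.
model = get_model(models[0].meta.name)
print(model.meta.name)
```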
Benchmark¶
seb.benchmark.Benchmark
¶
Benchmark is the main orchestrator of the SEB benchmark.
Source code in seb/benchmark.py
class Benchmark:
"""
Benchmark is the main orchestrator of the SEB benchmark.
"""
def __init__(
self,
languages: Optional[list[str]] = None,
tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
) -> None:
"""
Initialize the benchmark.
Args:
languages: A list of languages to run the benchmark on. If None, all languages are used.
tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
"""
self.languages = languages
self.tasks = self.get_tasks(tasks, languages)
self.task_names = [task.name for task in self.tasks]
@staticmethod
def get_tasks(
tasks: Optional[Union[Iterable[str], Iterable[Task]]],
languages: Optional[list[str]],
) -> list[Task]:
"""
Get the tasks for the benchmark.
Returns:
A list of tasks.
"""
_tasks = []
if tasks is None:
_tasks = get_all_tasks()
else:
for task in tasks:
if isinstance(task, str):
_tasks.append(get_task(task))
elif isinstance(task, Task):
_tasks.append(task)
else:
raise ValueError(f"Invalid task type: {type(task)}")
if languages is not None:
langs = set(languages)
_tasks = [task for task in _tasks if set(task.languages) & langs]
return _tasks
def evaluate_model(
self,
model: SebModel,
use_cache: bool = True,
run_model: bool = True,
raise_errors: bool = True,
cache_dir: Optional[Path] = None,
verbose: bool = True,
) -> BenchmarkResults:
"""
Evaluate a model on the benchmark.
Args:
model: The model to evaluate.
use_cache: Whether to use the cache.
run_model: Whether to run the model if the cache is not present.
raise_errors: Whether to raise errors.
cache_dir: The cache directory to use. If None, the default cache directory is used.
verbose: Whether to show a progress bar.
Returns:
The results of the benchmark.
"""
task_results = []
pbar = tqdm(
self.tasks,
position=1,
desc=f"Running {model.meta.name}",
leave=False,
disable=not verbose,
)
for task in pbar:
pbar.set_description(f"Running {model.meta.name} on {task.name}")
task_result = run_task(
task,
model,
use_cache=use_cache,
run_model=run_model,
raise_errors=raise_errors,
cache_dir=cache_dir,
)
task_results.append(task_result)
return BenchmarkResults(meta=model.meta, task_results=task_results)
def evaluate_models(
self,
models: list[SebModel],
use_cache: bool = True,
run_model: bool = True,
raise_errors: bool = True,
cache_dir: Optional[Path] = None,
verbose: bool = True,
) -> list[BenchmarkResults]:
"""
Evaluate a list of models on the benchmark.
Args:
models: The models to evaluate.
use_cache: Whether to use the cache.
run_model: Whether to run the model if the cache is not present.
raise_errors: Whether to raise errors.
cache_dir: The cache directory to use. If None, the default cache directory is used.
verbose: Whether to show a progress bar.
Returns:
The results of the benchmark, once for each model.
"""
results = []
pbar = tqdm(
models,
position=0,
desc="Running Benchmark",
leave=True,
disable=not verbose,
)
for model in pbar:
pbar.set_description(f"Running {model.meta.name}")
results.append(
self.evaluate_model(
model,
use_cache=use_cache,
run_model=run_model,
raise_errors=raise_errors,
cache_dir=cache_dir,
verbose=verbose,
),
)
return results
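A sketch of a typical run. The language codes below are illustrative and must match languages covered by the installed task set; the model is taken from the registry to avoid hard-coding a model name.

```python
from seb.benchmark import Benchmark
from seb.registries import get_all_models

# Restrict the benchmark to tasks covering Danish and Swedish
# ("da" and "sv" are illustrative language codes).
benchmark = Benchmark(languages=["da", "sv"])
print(benchmark.task_names)

# Evaluate a single registered model; cached results are reused by default.
model = get_all_models()[0]
results = benchmark.evaluate_model(model)
print(results.get_main_score())
```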
__init__(self, languages=None, tasks=None)
special
¶
Initialize the benchmark.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `languages` | `Optional[list[str]]` | A list of languages to run the benchmark on. If None, all languages are used. | `None` |
| `tasks` | `Union[collections.abc.Iterable[str], collections.abc.Iterable[seb.interfaces.task.Task]]` | The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects. | `None` |

Source code in seb/benchmark.py
def __init__(
self,
languages: Optional[list[str]] = None,
tasks: Optional[Union[Iterable[str], Iterable[Task]]] = None,
) -> None:
"""
Initialize the benchmark.
Args:
languages: A list of languages to run the benchmark on. If None, all languages are used.
tasks: The tasks to run the benchmark on. If None, all tasks are used. Can either be specified as strings or as Task objects.
"""
self.languages = languages
self.tasks = self.get_tasks(tasks, languages)
self.task_names = [task.name for task in self.tasks]
evaluate_model(self, model, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)
¶
Evaluate a model on the benchmark.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model` | `SebModel` | The model to evaluate. | required |
| `use_cache` | `bool` | Whether to use the cache. | `True` |
| `run_model` | `bool` | Whether to run the model if the cache is not present. | `True` |
| `raise_errors` | `bool` | Whether to raise errors. | `True` |
| `cache_dir` | `Optional[pathlib.Path]` | The cache directory to use. If None, the default cache directory is used. | `None` |
| `verbose` | `bool` | Whether to show a progress bar. | `True` |

Returns:

| Type | Description |
|---|---|
| `BenchmarkResults` | The results of the benchmark. |

Source code in seb/benchmark.py
def evaluate_model(
self,
model: SebModel,
use_cache: bool = True,
run_model: bool = True,
raise_errors: bool = True,
cache_dir: Optional[Path] = None,
verbose: bool = True,
) -> BenchmarkResults:
"""
Evaluate a model on the benchmark.
Args:
model: The model to evaluate.
use_cache: Whether to use the cache.
run_model: Whether to run the model if the cache is not present.
raise_errors: Whether to raise errors.
cache_dir: The cache directory to use. If None, the default cache directory is used.
verbose: Whether to show a progress bar.
Returns:
The results of the benchmark.
"""
task_results = []
pbar = tqdm(
self.tasks,
position=1,
desc=f"Running {model.meta.name}",
leave=False,
disable=not verbose,
)
for task in pbar:
pbar.set_description(f"Running {model.meta.name} on {task.name}")
task_result = run_task(
task,
model,
use_cache=use_cache,
run_model=run_model,
raise_errors=raise_errors,
cache_dir=cache_dir,
)
task_results.append(task_result)
return BenchmarkResults(meta=model.meta, task_results=task_results)
evaluate_models(self, models, use_cache=True, run_model=True, raise_errors=True, cache_dir=None, verbose=True)
¶
Evaluate a list of models on the benchmark.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `models` | `list` | The models to evaluate. | required |
| `use_cache` | `bool` | Whether to use the cache. | `True` |
| `run_model` | `bool` | Whether to run the model if the cache is not present. | `True` |
| `raise_errors` | `bool` | Whether to raise errors. | `True` |
| `cache_dir` | `Optional[pathlib.Path]` | The cache directory to use. If None, the default cache directory is used. | `None` |
| `verbose` | `bool` | Whether to show a progress bar. | `True` |

Returns:

| Type | Description |
|---|---|
| `list` | The results of the benchmark, once for each model. |

Source code in seb/benchmark.py
def evaluate_models(
self,
models: list[SebModel],
use_cache: bool = True,
run_model: bool = True,
raise_errors: bool = True,
cache_dir: Optional[Path] = None,
verbose: bool = True,
) -> list[BenchmarkResults]:
"""
Evaluate a list of models on the benchmark.
Args:
models: The models to evaluate.
use_cache: Whether to use the cache.
run_model: Whether to run the model if the cache is not present.
raise_errors: Whether to raise errors.
cache_dir: The cache directory to use. If None, the default cache directory is used.
verbose: Whether to show a progress bar.
Returns:
The results of the benchmark, once for each model.
"""
results = []
pbar = tqdm(
models,
position=0,
desc="Running Benchmark",
leave=True,
disable=not verbose,
)
for model in pbar:
pbar.set_description(f"Running {model.meta.name}")
results.append(
self.evaluate_model(
model,
use_cache=use_cache,
run_model=run_model,
raise_errors=raise_errors,
cache_dir=cache_dir,
verbose=verbose,
),
)
return results
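A sketch of benchmarking several models and writing each result set to disk; the model selection and output directory are illustrative, and `raise_errors=False` records failures as TaskError objects instead of stopping the run.

```python
from pathlib import Path

from seb.benchmark import Benchmark
from seb.registries import get_all_models

benchmark = Benchmark()

# Evaluate the first two registered models (illustrative selection).
models = get_all_models()[:2]
all_results = benchmark.evaluate_models(models, raise_errors=False)

# Persist each BenchmarkResults to its own directory.
for result in all_results:
    result.to_disk(Path("benchmark_results") / result.meta.name)
```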
get_tasks(tasks, languages)
staticmethod
¶
Get the tasks for the benchmark.
Returns:

| Type | Description |
|---|---|
| `list` | A list of tasks. |

Source code in seb/benchmark.py
@staticmethod
def get_tasks(
tasks: Optional[Union[Iterable[str], Iterable[Task]]],
languages: Optional[list[str]],
) -> list[Task]:
"""
Get the tasks for the benchmark.
Returns:
A list of tasks.
"""
_tasks = []
if tasks is None:
_tasks = get_all_tasks()
else:
for task in tasks:
if isinstance(task, str):
_tasks.append(get_task(task))
elif isinstance(task, Task):
_tasks.append(task)
else:
raise ValueError(f"Invalid task type: {type(task)}")
if languages is not None:
langs = set(languages)
_tasks = [task for task in _tasks if set(task.languages) & langs]
return _tasks
Interfaces¶
SEB implements two main interfaces: a task interface, which represents a task within the benchmark, and a model interface, which represents a model applied to the tasks.
Model Interface¶
seb.interfaces.model.Encoder (Protocol)
¶
Interface which all models must implement.
Source code in seb/interfaces/model.py
@runtime_checkable
class Encoder(Protocol):
"""
Interface which all models must implement.
"""
def encode(
self,
sentences: list[str],
*,
task: Optional["Task"] = None,
batch_size: int = 32,
**kwargs: Any,
) -> np.ndarray:
"""Returns a list of embeddings for the given sentences.
Args:
sentences: List of sentences to encode
task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
to be used.
batch_size: Batch size for the encoding
kwargs: arguments to pass to the models encode method
Returns:
Embeddings for the given documents
"""
...
# The following methods are optional and can be implemented if the model supports them.
# def to(self, device: torch.device):
# ...
# def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
# ...
# def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
# ...
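A minimal sketch of a class satisfying the Encoder protocol. The RandomEncoder below is purely illustrative: it returns random vectors instead of real embeddings, but it has the required encode signature and therefore passes the runtime protocol check.

```python
from typing import Any, Optional

import numpy as np

from seb.interfaces.model import Encoder
from seb.interfaces.task import Task


class RandomEncoder:
    """Illustrative encoder returning random vectors instead of embeddings."""

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional[Task] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> np.ndarray:
        # One 384-dimensional vector per input sentence.
        rng = np.random.default_rng(seed=42)
        return rng.normal(size=(len(sentences), 384))


# Encoder is runtime checkable, so conformance can be verified directly.
assert isinstance(RandomEncoder(), Encoder)
```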
encode(self, sentences, *, task=None, batch_size=32, **kwargs)
¶
Returns a list of embeddings for the given sentences.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `sentences` | `list` | List of sentences to encode. | required |
| `task` | `Optional[Task]` | The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used. | `None` |
| `batch_size` | `int` | Batch size for the encoding. | `32` |
| `kwargs` | `Any` | Arguments to pass to the model's encode method. | `{}` |

Returns:

| Type | Description |
|---|---|
| `ndarray` | Embeddings for the given documents. |

Source code in seb/interfaces/model.py
def encode(
self,
sentences: list[str],
*,
task: Optional["Task"] = None,
batch_size: int = 32,
**kwargs: Any,
) -> np.ndarray:
"""Returns a list of embeddings for the given sentences.
Args:
sentences: List of sentences to encode
task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
to be used.
batch_size: Batch size for the encoding
kwargs: arguments to pass to the models encode method
Returns:
Embeddings for the given documents
"""
...
seb.interfaces.model.LazyLoadEncoder (Encoder)
dataclass
¶
Encoder object, which lazy loads the model on the first call to encode()
Source code in seb/interfaces/model.py
@dataclass
class LazyLoadEncoder(Encoder):
"""Encoder object, which lazy loads the model on the first call to encode()"""
loader: Callable[[], Encoder]
_model: Optional[Encoder] = None
def load_model(self):
"""
Load the model.
"""
if self._model is None:
self._model = self.loader()
def to(self, device: torch.device):
self.load_model()
try:
self._model = self._model.to(device) # type: ignore
except AttributeError:
logging.debug(f"Model {self._model} does not have a to method")
@property
def model(self) -> Encoder:
"""
Dynamically load the model.
"""
self.load_model()
return self._model # type: ignore
def encode(
self,
sentences: list[str],
*,
task: Optional["Task"] = None,
**kwargs: Any,
) -> np.ndarray:
"""
Returns a list of embeddings for the given sentences.
Args:
sentences: List of sentences to encode
task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
to be used.
batch_size: Batch size for the encoding
kwargs: arguments to pass to the models encode method
Returns:
Embeddings for the given documents
"""
return self.model.encode(sentences, task=task, **kwargs)
def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
try:
return self.model.encode_queries(queries, **kwargs) # type: ignore
except AttributeError:
return self.encode(queries, **kwargs)
def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
try:
return self.model.encode_corpus(corpus, **kwargs) # type: ignore
except AttributeError:
sep = " "
if isinstance(corpus, dict):
sentences = [
(corpus["title"][i] + sep + corpus["text"][i]).strip() if "title" in corpus else corpus["text"][i].strip() # type: ignore
for i in range(len(corpus["text"])) # type: ignore
]
else:
sentences = [(doc["title"] + sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus]
return self.encode(sentences, **kwargs)
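A sketch of LazyLoadEncoder in use. The RandomEncoder from the Encoder sketch above stands in for a real, expensive-to-load model; the loader is only invoked on the first call to encode().

```python
from seb.interfaces.model import LazyLoadEncoder

# Constructing the wrapper is cheap: the loader runs on first use.
lazy_encoder = LazyLoadEncoder(loader=lambda: RandomEncoder())

embeddings = lazy_encoder.encode(["en sætning", "ett exempel"])
print(embeddings.shape)  # (2, 384) with the RandomEncoder above
```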
model: Encoder
property
readonly
¶
Dynamically load the model.
__init__(self, loader, _model=None)
special
¶
Initialize self. See help(type(self)) for accurate signature.
encode(self, sentences, *, task=None, **kwargs)
¶
Returns a list of embeddings for the given sentences.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `sentences` | `list` | List of sentences to encode. | required |
| `task` | `Optional[Task]` | The task to encode for. This allows the model to encode differently for different tasks. It is always given but does not need to be used. | `None` |
| `batch_size` | | Batch size for the encoding; forwarded to the wrapped encoder via `**kwargs`. | |
| `kwargs` | `Any` | Arguments to pass to the model's encode method. | `{}` |

Returns:

| Type | Description |
|---|---|
| `ndarray` | Embeddings for the given documents. |

Source code in seb/interfaces/model.py
def encode(
self,
sentences: list[str],
*,
task: Optional["Task"] = None,
**kwargs: Any,
) -> np.ndarray:
"""
Returns a list of embeddings for the given sentences.
Args:
sentences: List of sentences to encode
task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
to be used.
batch_size: Batch size for the encoding
kwargs: arguments to pass to the models encode method
Returns:
Embeddings for the given documents
"""
return self.model.encode(sentences, task=task, **kwargs)
load_model(self)
¶
Load the model.
Source code in seb/interfaces/model.py
def load_model(self):
"""
Load the model.
"""
if self._model is None:
self._model = self.loader()
seb.interfaces.model.SebModel
dataclass
¶
An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit) and includes metadata pertaining to the specific model.
Source code in seb/interfaces/model.py
@dataclass
class SebModel:
"""
An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit)
and includes metadata pertaining to the specific model.
"""
meta: ModelMeta
encoder: Encoder
@property
def number_of_parameters(self) -> Optional[int]:
"""
Returns the number of parameters in the model.
"""
if hasattr(self.encoder, "num_parameters"):
return sum(p.numel() for p in self.model.parameters() if p.requires_grad) # type: ignore
return None
number_of_parameters: Optional[int]
property
readonly
¶
Returns the number of parameters in the model.
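A sketch of inspecting a SebModel. The model is taken from the registry because ModelMeta's required fields are not documented on this page.

```python
from seb.registries import get_all_models

model = get_all_models()[0]

# Metadata is available immediately; the underlying encoder is loaded
# dynamically, so nothing heavy happens until encode() is called.
print(model.meta.name)

# May be None when the encoder does not expose a parameter count.
print(model.number_of_parameters)
```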
Task Interface¶
seb.interfaces.task.Task (Protocol)
¶
A task is a specific evaluation task for a sentence embedding model.
Attributes:

| Name | Type | Description |
|---|---|---|
| `name` | `str` | The name of the task. |
| `main_score` | `str` | The main score of the task. |
| `reference` | `str` | A reference to the task. |
| `version` | `str` | The version of the task. |
| `languages` | `list` | The languages of the task. |
| `domain` | `list` | The domains of the task. Should be one of the categories listed on https://universaldependencies.org |
| `task_type` | `Literal['Classification', 'Retrieval', 'STS', 'BitextMining', 'Clustering', 'Speed']` | The task type, which determines how the task is evaluated, e.g. Classification. |
| `task_subtypes` | `list` | A list of subtypes, e.g. Sentiment Classification. |
| `description` | `str` | A description of the task. |

Source code in seb/interfaces/task.py
@runtime_checkable
class Task(Protocol):
"""
A task is a specific evaluation task for a sentence embedding model.
Attributes:
name: The name of the task.
main_score: The main score of the task.
reference: A reference to the task.
version: The version of the task.
languages: The languages of the task.
domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
task_type: A list of task types, determines how the task is being evaluated. E.g. Classification.
task_subtypes: a list of subtypes e.g. Sentiment Classification.
description: A description of the task.
"""
name: str
main_score: str
reference: str
version: str
languages: list[Language]
domain: list[Domain]
task_type: TaskType
task_subtypes: list[str]
description: str
def evaluate(self, model: Encoder) -> TaskResult:
"""
Evaluates a Sentence Embedding Model on the task.
Args:
model: A model with the encode method implemented.
Returns:
A TaskResult object.
"""
...
def get_documents(self) -> list[str]:
"""
Get the documents for the task.
Returns:
A list of strings.
"""
...
def get_descriptive_stats(self) -> DescriptiveDatasetStats:
texts = self.get_documents()
document_lengths = np.array([len(text) for text in texts])
mean = float(np.mean(document_lengths))
std = float(np.std(document_lengths))
return DescriptiveDatasetStats(
mean_document_length=mean,
std_document_length=std,
num_documents=len(document_lengths),
)
def name_to_path(self) -> str:
"""
Convert a name to a path.
"""
name = self.name.replace("/", "__").replace(" ", "_")
return name
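A sketch of running a single task outside the Benchmark orchestrator. Both the task and the model are taken from the registries, and evaluate expects an Encoder, so the model's encoder attribute is passed rather than the SebModel itself.

```python
from seb.registries import get_all_models, get_all_tasks

task = get_all_tasks()[0]
model = get_all_models()[0]

# Dataset statistics only require the task's documents, not a model.
print(task.get_descriptive_stats())

# Evaluation returns a TaskResult with per-language scores.
result = task.evaluate(model.encoder)
print(result.get_main_score())
```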
evaluate(self, model)
¶
Evaluates a Sentence Embedding Model on the task.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model` | `Encoder` | A model with the encode method implemented. | required |

Returns:

| Type | Description |
|---|---|
| `TaskResult` | A TaskResult object. |

Source code in seb/interfaces/task.py
def evaluate(self, model: Encoder) -> TaskResult:
"""
Evaluates a Sentence Embedding Model on the task.
Args:
model: A model with the encode method implemented.
Returns:
A TaskResult object.
"""
...
get_documents(self)
¶
Get the documents for the task.
Returns:

| Type | Description |
|---|---|
| `list` | A list of strings. |

Source code in seb/interfaces/task.py
def get_documents(self) -> list[str]:
"""
Get the documents for the task.
Returns:
A list of strings.
"""
...
name_to_path(self)
¶
Convert a name to a path.
Source code in seb/interfaces/task.py
def name_to_path(self) -> str:
"""
Convert a name to a path.
"""
name = self.name.replace("/", "__").replace(" ", "_")
return name
Data Classes¶
SEB uses data classes to store the results of a benchmark. The following classes are available:
seb.result_dataclasses.BenchmarkResults (BaseModel)
¶
Dataclass for storing benchmark results.
Attributes:

| Name | Type | Description |
|---|---|---|
| `meta` | `ModelMeta` | ModelMeta object. |
| `task_results` | `list` | List of TaskResult objects. |

Source code in seb/result_dataclasses.py
class BenchmarkResults(BaseModel):
"""
Dataclass for storing benchmark results.
Attributes:
meta: ModelMeta object.
task_results: List of TaskResult objects.
"""
meta: ModelMeta
task_results: list[Union[TaskResult, TaskError]]
def get_main_score(self, lang: Optional[Iterable[Language]] = None) -> float:
scores = [t.get_main_score(lang) for t in self.task_results]
if scores:
return sum(scores) / len(scores)
return np.nan
def __iter__(self) -> Iterator[Union[TaskResult, TaskError]]: # type: ignore
return iter(self.task_results)
def __getitem__(self, index: int) -> Union[TaskResult, TaskError]:
return self.task_results[index]
def __len__(self) -> int:
return len(self.task_results)
def to_disk(self, path: Path) -> None:
"""
Write task results to a path.
"""
if path.is_file():
raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
path.mkdir(parents=True, exist_ok=True)
for task_result in self.task_results:
if isinstance(task_result, TaskResult):
task_result.to_disk(path / f"{task_result.task_name}.json")
else:
task_result.to_disk(path / f"{task_result.task_name}.error.json")
meta_path = path / "meta.json"
self.meta.to_disk(meta_path)
@classmethod
def from_disk(cls, path: Path) -> "BenchmarkResults":
"""
Load task results from a path.
"""
if not path.is_dir():
raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
task_results = []
for file in path.glob("*.json"):
if file.stem == "meta":
continue
if file.stem.endswith(".error"):
task_results.append(TaskError.from_disk(file))
else:
task_results.append(TaskResult.from_disk(file))
meta_path = path / "meta.json"
meta = ModelMeta.from_disk(meta_path)
return cls(meta=meta, task_results=task_results)
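A sketch of persisting results to disk and loading them back. The result object comes from a benchmark run as above, and the output directory is illustrative.

```python
from pathlib import Path

from seb.benchmark import Benchmark
from seb.registries import get_all_models
from seb.result_dataclasses import BenchmarkResults

benchmark = Benchmark()
results = benchmark.evaluate_model(get_all_models()[0])

# Each TaskResult/TaskError is written as its own JSON file next to meta.json.
out_dir = Path("results") / "my-model"  # illustrative directory
results.to_disk(out_dir)

# Reload the directory into an equivalent BenchmarkResults object.
reloaded = BenchmarkResults.from_disk(out_dir)
print(reloaded.get_main_score(), len(reloaded))
```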
from_disk(path)
classmethod
¶
Load task results from a path.
Source code in seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "BenchmarkResults":
"""
Load task results from a path.
"""
if not path.is_dir():
raise ValueError("Can't load BenchmarkResults from path: {path}. Path must be a directory.")
task_results = []
for file in path.glob("*.json"):
if file.stem == "meta":
continue
if file.stem.endswith(".error"):
task_results.append(TaskError.from_disk(file))
else:
task_results.append(TaskResult.from_disk(file))
meta_path = path / "meta.json"
meta = ModelMeta.from_disk(meta_path)
return cls(meta=meta, task_results=task_results)
to_disk(self, path)
¶
Write task results to a path.
Source code in seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
"""
Write task results to a path.
"""
if path.is_file():
raise ValueError("Can't save BenchmarkResults to a file. Path must be a directory.")
path.mkdir(parents=True, exist_ok=True)
for task_result in self.task_results:
if isinstance(task_result, TaskResult):
task_result.to_disk(path / f"{task_result.task_name}.json")
else:
task_result.to_disk(path / f"{task_result.task_name}.error.json")
meta_path = path / "meta.json"
self.meta.to_disk(meta_path)
seb.result_dataclasses.TaskResult (BaseModel)
¶
Dataclass for storing task results.
Attributes:

| Name | Type | Description |
|---|---|---|
| `task_name` | `str` | Name of the task. |
| `task_description` | `str` | Description of the task. |
| `task_version` | `str` | Version of the task. |
| `time_of_run` | `datetime` | Time of the run. |
| `scores` | `dict` | Dictionary of scores of the form {language: {"metric": value}}. |
| `main_score` | `str` | Name of the main score. |

Source code in seb/result_dataclasses.py
class TaskResult(BaseModel):
"""
Dataclass for storing task results.
Attributes:
task_name: Name of the task.
task_description: Description of the task.
task_version: Version of the task.
time_of_run: Time of the run.
scores: Dictionary of scores on the form {language: {"metric": value}}.
main_score: Name of the main score.
"""
task_name: str
task_description: str
task_version: str
time_of_run: datetime
scores: dict[Language, dict[str, Union[float, str]]] # {language: {"metric": value}}.
main_score: str
def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
"""
Returns the main score for a given set of languages.
Args:
lang: List of languages to get the main score for.
Returns:
The main score.
"""
main_scores = []
if lang is None:
lang = self.scores.keys()
for l in lang:
main_scores.append(self.scores[l][self.main_score]) # type: ignore
return sum(main_scores) / len(main_scores)
@property
def languages(self) -> list[Language]:
"""
Returns the languages of the task.
"""
return list(self.scores.keys())
@classmethod
def from_disk(cls, path: Path) -> "TaskResult":
"""
Load task results from a path.
"""
with path.open("r") as f:
task_results = json.load(f)
return cls(**task_results)
def to_disk(self, path: Path) -> None:
"""
Write task results to a path.
"""
path.parent.mkdir(parents=True, exist_ok=True)
json_str: str = self.model_dump_json() # type: ignore
with path.open("w") as f:
f.write(json_str)
def name_to_path(self) -> str:
"""
Convert a name to a path.
"""
name = self.task_name.replace("/", "__").replace(" ", "_")
return name
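A sketch of loading one stored TaskResult and filtering its main score by language. The file path and the language code are illustrative and depend on what was written earlier with to_disk.

```python
from pathlib import Path

from seb.result_dataclasses import TaskResult

# Illustrative path to a result previously written with to_disk.
result = TaskResult.from_disk(Path("results") / "my-model" / "LCC.json")

print(result.languages)                    # languages with recorded scores
print(result.get_main_score())             # averaged over all languages
print(result.get_main_score(lang=["da"]))  # restricted to Danish ("da" is illustrative)
```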
languages: list
property
readonly
¶
Returns the languages of the task.
from_disk(path)
classmethod
¶
Load task results from a path.
Source code in seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskResult":
"""
Load task results from a path.
"""
with path.open("r") as f:
task_results = json.load(f)
return cls(**task_results)
get_main_score(self, lang=None)
¶
Returns the main score for a given set of languages.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `lang` | `Optional[collections.abc.Iterable[str]]` | List of languages to get the main score for. | `None` |

Returns:

| Type | Description |
|---|---|
| `float` | The main score. |

Source code in seb/result_dataclasses.py
def get_main_score(self, lang: Optional[Iterable[str]] = None) -> float:
"""
Returns the main score for a given set of languages.
Args:
lang: List of languages to get the main score for.
Returns:
The main score.
"""
main_scores = []
if lang is None:
lang = self.scores.keys()
for l in lang:
main_scores.append(self.scores[l][self.main_score]) # type: ignore
return sum(main_scores) / len(main_scores)
name_to_path(self)
¶
Convert a name to a path.
Source code in seb/result_dataclasses.py
def name_to_path(self) -> str:
"""
Convert a name to a path.
"""
name = self.task_name.replace("/", "__").replace(" ", "_")
return name
to_disk(self, path)
¶
Write task results to a path.
Source code in seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
"""
Write task results to a path.
"""
path.parent.mkdir(parents=True, exist_ok=True)
json_str: str = self.model_dump_json() # type: ignore
with path.open("w") as f:
f.write(json_str)
seb.result_dataclasses.TaskError (BaseModel)
¶
Dataclass for storing an error encountered while running a task. It records the task name, the error message, the time of the run, and the task's languages.
Source code in seb/result_dataclasses.py
class TaskError(BaseModel):
task_name: str
error: str
time_of_run: datetime
languages: list[str] = []
def to_disk(self, path: Path) -> None:
"""
Write task results to a path.
"""
path.parent.mkdir(parents=True, exist_ok=True)
json_str: str = self.model_dump_json() # type: ignore
with path.open("w") as f:
f.write(json_str)
@classmethod
def from_disk(cls, path: Path) -> "TaskError":
"""
Load task results from a path.
"""
with path.open() as f:
task_results = json.load(f)
return cls(**task_results)
@staticmethod
def get_main_score(lang: Optional[Iterable[str]] = None) -> float: # noqa: ARG004
return np.nan
def name_to_path(self) -> str:
"""
Convert a name to a path.
"""
name = self.task_name.replace("/", "__").replace(" ", "_")
return name
from_disk(path)
classmethod
¶
Load task results from a path.
Source code in seb/result_dataclasses.py
@classmethod
def from_disk(cls, path: Path) -> "TaskError":
"""
Load task results from a path.
"""
with path.open() as f:
task_results = json.load(f)
return cls(**task_results)
name_to_path(self)
¶
Convert a name to a path.
Source code in seb/result_dataclasses.py
def name_to_path(self) -> str:
"""
Convert a name to a path.
"""
name = self.task_name.replace("/", "__").replace(" ", "_")
return name
to_disk(self, path)
¶
Write task results to a path.
Source code in seb/result_dataclasses.py
def to_disk(self, path: Path) -> None:
"""
Write task results to a path.
"""
path.parent.mkdir(parents=True, exist_ok=True)
json_str: str = self.model_dump_json() # type: ignore
with path.open("w") as f:
f.write(json_str)