Source code for augmenty.augment_utilities

"""Utility functions used for augmentation."""

import random
from functools import partial
from typing import Callable, Iterable, Iterator

from spacy.language import Language
from spacy.training import Example


def combine(
    augmenters: Iterable[Callable[[Language, Example], Iterator[Example]]],  # type: ignore
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Combine a series of spaCy style augmenters into a single augmenter.

    The augmenters are applied in order: every example produced by one
    augmenter is fed to the next one, and the examples produced by the last
    augmenter are yielded.

    Args:
        augmenters: An iterable of spaCy augmenters. It is consumed once, when
            ``combine`` is called, so a one-shot iterable such as a generator
            is safe to pass.

    Returns:
        The combined augmenter.

    Example:
        >>> char_swap_augmenter = augmenty.load("char_swap_v1", level=.02)
        >>> synonym_augmenter = augmenty.load("wordnet_synonym_v1", level=1, lang="en")
        >>> combined_aug = augmenty.combine([char_swap_augmenter, synonym_augmenter])
        >>> # combine doc using two augmenters
        >>> augmented_docs = list(augmenty.docs(docs, augmenter=combined_aug, nlp=nlp))
    """
    # Snapshot the augmenters up front: the returned augmenter is called once
    # per example, and re-iterating a one-shot iterable would leave every call
    # after the first with nothing to apply.
    pipeline = tuple(augmenters)

    def apply_multiple_augmenters(nlp: Language, example: Example):
        examples = [example]
        for aug in pipeline:
            # Each stage is fully materialised before the next stage starts.
            examples = [out for ex in examples for out in aug(nlp, ex)]
        yield from examples

    return apply_multiple_augmenters
def set_doc_level(
    augmenter: Callable[[Language, Example], Iterator[Example]],  # type: ignore
    level: float,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Set the percentage of examples that the augmenter should be applied to.

    Args:
        augmenter: A spaCy augmenter which you only want to apply to a certain
            percentage of docs.
        level: The percentage of docs which should be augmented, given as a
            probability between 0 and 1.

    Returns:
        An augmenter which, for each example, applies ``augmenter`` with
        probability ``level`` and otherwise yields the example unchanged.
    """

    def __augment(nlp: Language, example: Example):
        # random.random() draws from [0.0, 1.0), so ">=" makes level=0.0 never
        # augment and level=1.0 always augment.
        if random.random() >= level:
            yield example
        else:
            yield from augmenter(nlp, example)

    return __augment
def repeat(
    augmenter: Callable[[Language, Example], Iterator[Example]],  # type: ignore
    n: int,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Run an augmenter n times over the same example, increasing the sample size.

    Args:
        augmenter: The augmenter to repeat.
        n: How many times the augmenter is run on each example.

    Returns:
        An augmenter that yields the output of ``n`` consecutive runs of
        ``augmenter`` on the example.

    Example:
        >>> augmenter = augmenty.load("char_swap_v1", level=.02)
        >>> repeated_augmenter = augmenty.repeat(augmenter=augmenter, n=3)
    """

    def _repeated(nlp: Language, example: Example):
        for _ in range(n):  # type: ignore
            # A fresh run of the augmenter on every pass.
            for augmented in augmenter(nlp, example):
                yield augmented

    return _repeated
def yield_original(
    augmenter: Callable[[Language, Example], Iterator[Example]],  # type: ignore
    doc_level: float = 1.0,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Wrap an augmenter so that it also yields the original example.

    Args:
        augmenter: A spaCy augmenter.
        doc_level: The percentage of documents the augmenter should be applied
            to. When the augmenter is not applied, only the original example
            is yielded.

    Returns:
        An augmenter which yields the augmented example(s), when the augmenter
        is applied, followed by the original example.
    """

    def _with_original(nlp: Language, example: Example, level: float):
        # One random draw per example decides whether the augmenter runs.
        apply_augmenter = random.random() < level
        if apply_augmenter:
            for augmented in augmenter(nlp, example):
                yield augmented
        # The original example always comes last.
        yield example

    return partial(_with_original, level=doc_level)
def make_text_from_orth(example_dict: dict) -> str:
    """Reconstruct the text based on ORTH and SPACY from an Example turned to dict.

    Args:
        example_dict: A dict with a ``"token_annotation"`` entry holding the
            parallel sequences ``"ORTH"`` (token texts) and ``"SPACY"`` (whether
            each token is followed by a space).

    Returns:
        The token texts concatenated in order, with a single space after every
        token whose SPACY flag is truthy.
    """
    token_annotation = example_dict["token_annotation"]
    # Build the pieces and join once rather than growing a string with "+=".
    return "".join(
        orth + " " if has_space else orth
        for orth, has_space in zip(  # type: ignore
            token_annotation["ORTH"],
            token_annotation["SPACY"],
        )
    )