# Source code for augmenty.token.spacing

import random
from functools import partial
from typing import Callable, Iterator

import spacy
from spacy.language import Language
from spacy.training import Example

from ..augment_utilities import make_text_from_orth


def letter_spacing_augmenter_v1(
    nlp: Language,
    example: Example,
    level: float,
) -> Iterator[Example]:  # type: ignore
    """Yield a copy of *example* where tokens may get letter spacing.

    With probability ``level`` a token's orthographic form is replaced by
    its characters joined with single spaces (e.g. "word" -> "w o r d").

    Args:
        nlp: The spaCy pipeline used to re-tokenize the augmented text.
        example: The example to augment.
        level: Per-token probability of applying letter spacing.

    Yields:
        The augmented example.
    """

    def _maybe_space_out(token):
        # Apply the spacing emphasis to a randomly chosen subset of tokens.
        if random.random() < level:
            return " ".join(token.text)
        return token.text

    annotations = example.to_dict()
    annotations["token_annotation"]["ORTH"] = [
        _maybe_space_out(token) for token in example.y
    ]
    new_text = make_text_from_orth(annotations)
    new_doc = nlp.make_doc(new_text)
    yield example.from_dict(new_doc, annotations)


@spacy.registry.augmenters("letter_spacing_augmenter_v1")  # type: ignore
def create_letter_spacing_augmenter_v1(
    level: float,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Create an augmenter that randomly adds letter spacing to words.

    Typically casing is used to add emphasis to words, but letter spacing
    has also been used to add e m p h a s i s to words (e.g. by Grundtvig;
    Baunvig, Jarvis and Nielbo, 2020). The augmented text remains human
    readable, but is clearly challenging for systems using a white-space
    centric tokenization.

    Args:
        level: The probability of adding grundtvigian letter spacing
            emphasis to a word.

    Returns:
        The augmenter.
    """
    return partial(letter_spacing_augmenter_v1, level=level)
def spacing_insertion_augmenter_v1(
    nlp: Language,
    example: Example,
    level: float,
    max_insertions: int,
) -> Iterator[Example]:  # type: ignore
    """Yield a copy of *example* with random spaces inserted inside tokens.

    After each character of a token (except the last, so no trailing space
    is ever produced) a space is inserted with probability ``level``, up to
    ``max_insertions`` insertions per token.

    Args:
        nlp: The spaCy pipeline used to re-tokenize the augmented text.
        example: The example to augment.
        level: Per-character probability of inserting a space.
        max_insertions: Maximum number of insertions per token.

    Yields:
        The augmented example.
    """

    def _insert_spaces(token):
        inserted = 0
        pieces = []
        # Skip the final character: a token cannot end in a space.
        for char in token.text[:-1]:
            pieces.append(char)
            if random.random() < level and inserted < max_insertions:
                inserted += 1
                pieces.append(" ")
        pieces.append(token.text[-1])
        return "".join(pieces)

    annotations = example.to_dict()
    annotations["token_annotation"]["ORTH"] = [
        _insert_spaces(token) for token in example.y
    ]
    new_text = make_text_from_orth(annotations)
    new_doc = nlp.make_doc(new_text)
    yield example.from_dict(new_doc, annotations)
@spacy.registry.augmenters("spacing_insertion_v1")  # type: ignore
def create_spacing_insertion_augmenter_v1(
    level: float,
    max_insertions: int = 1,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Creates an augmenter that randomly adds a space after a character.

    Tokens are kept the same.

    Args:
        level: The probability to add a space after a character.
        max_insertions: Maximum number of insertions per word.

    Returns:
        The augmenter.
    """
    return partial(
        spacing_insertion_augmenter_v1,
        level=level,
        max_insertions=max_insertions,
    )