# Source code for augmenty.character.replace

"""Augmenters for randomly or semi-randomly replacing characters."""

import random
from functools import partial
from typing import Callable, Iterator

import spacy
from spacy.language import Language
from spacy.tokens import Token
from spacy.training import Example

from augmenty.util import Augmenter

from ..augment_utilities import make_text_from_orth
from ..keyboard import Keyboard


def char_replace_augmenter_v1(
    nlp: Language,
    example: Example,
    level: float,
    replace: dict,
) -> Iterator[Example]:
    """Yield a copy of ``example`` in which characters are randomly replaced.

    Args:
        nlp: Pipeline whose ``make_doc`` is used to build the doc for the
            augmented text.
        example: The example to augment. Tokens are read from
            ``example.reference``.
        level: Probability that a character which has an entry in ``replace``
            is replaced.
        replace: Maps a character to a sequence of candidate replacements,
            one of which is picked uniformly at random. Characters without an
            entry, or with an empty candidate sequence, are left unchanged.

    Yields:
        A single example built from the augmented text and the updated
        token annotations.
    """

    def __replace(t: Token) -> str:
        chars = []
        for c in t.text:
            # The random draw happens for every character, whether or not it
            # has an entry in `replace`.
            if random.random() < level and c in replace:
                candidates = replace[c]
                # random.choice raises IndexError on an empty sequence; keep
                # the character instead of aborting the whole augmentation.
                if len(candidates) > 0:
                    c = random.choice(candidates)
            chars.append(c)
        return "".join(chars)

    example_dict = example.to_dict()
    example_dict["token_annotation"]["ORTH"] = [__replace(t) for t in example.reference]
    text = make_text_from_orth(example_dict)
    doc = nlp.make_doc(text)
    yield example.from_dict(doc, example_dict)


@spacy.registry.augmenters("char_replace_random_v1")  # type: ignore
def create_char_random_augmenter_v1(
    level: float,
    keyboard: str = "en_qwerty_v1",
) -> Augmenter:
    """Creates an augmenter that replaces a character with a random character
    from the keyboard.

    Only characters that are keys on the keyboard are candidates for
    replacement. The replacement is drawn from all keys of the keyboard, so it
    may be the original character itself.

    Args:
        level: The probability to replace a character with a random character
            from the keyboard.
        keyboard: A defined keyboard in the keyboard registry. To see a list
            of all keyboard you can run `augmenty.keyboards()`. Defaults to
            "en_qwerty_v1".

    Returns:
        The augmenter.

    Example:
        >>> import augmenty
        >>> from spacy.lang.en import English
        >>> nlp = English()
        >>> char_random_augmenter = augmenty.load("char_replace_random_v1", level=0.1)
        >>> texts = ["A sample text"]
        >>> list(augmenty.texts(texts, char_random_augmenter, nlp))
        ["A sabple tex3"]
    """
    kb = Keyboard.from_registry(keyboard)
    # Collect the keys once instead of re-running kb.all_keys() for every key.
    # The candidate list is only read by char_replace_augmenter_v1, so every
    # key can map to the same list object.
    all_keys = list(kb.all_keys())
    replace_dict = {k: all_keys for k in all_keys}
    return partial(char_replace_augmenter_v1, replace=replace_dict, level=level)


@spacy.registry.augmenters("char_replace_v1")  # type: ignore
def create_char_replace_augmenter_v1(
    level: float,
    replace: dict,
) -> Augmenter:
    """Creates an augmenter that replaces a character with a random character
    from the replace dict.

    Args:
        level: Probability to augment a character, if the document is
            augmented.
        replace: A dictionary mapping each character to its potential
            replacements.

    Returns:
        The augmenter function.

    Example:
        >>> create_char_replace_augmenter_v1(level=0.02,
        ...                                  replace={"æ": ["ae"], "ß": ["ss"]})
    """
    return partial(
        char_replace_augmenter_v1,
        level=level,
        replace=replace,
    )


@spacy.registry.augmenters("keystroke_error_v1")  # type: ignore
def create_keystroke_error_augmenter_v1(
    level: float,
    distance: float = 1.5,
    keyboard: str = "en_qwerty_v1",
) -> Augmenter:
    """Creates an augmenter which augments a text with plausible typos based
    on keyboard distance.

    Args:
        level: The probability to replace a character with a neighbouring
            character.
        distance: Keyboard distance. Defaults to 1.5, corresponding to
            neighbouring keys including diagonals.
        keyboard: A defined keyboard in the keyboard registry. To see a list
            of all keyboard you can run `augmenty.keyboards.get_all()`.
            Defaults to "en_qwerty_v1".

    Returns:
        The augmenter.

    Example:
        >>> import augmenty
        >>> from spacy.lang.en import English
        >>> nlp = English()
        >>> keystroke_error_augmenter = augmenty.load("keystroke_error_v1",
        ...                                           level=0.1,
        ...                                           keyboard="en_qwerty_v1")
        >>> texts = ["A sample text"]
        >>> list(augmenty.texts(texts, keystroke_error_augmenter, nlp))
        ["A sajple texr"]
    """
    kb = Keyboard.from_registry(keyboard)
    # Candidate replacements for each key come from the keyboard's distance
    # dictionary for the requested distance.
    replace_dict = kb.create_distance_dict(distance=distance)  # type: ignore
    return partial(char_replace_augmenter_v1, replace=replace_dict, level=level)