# Source code for augmenty.character.replace

"""Augmenters for randomly or semi-randomly replacing characters."""

import random
from functools import partial
from typing import Callable, Iterator

import spacy
from spacy.language import Language
from spacy.tokens import Token
from spacy.training import Example

from augmenty.util import Augmenter

from ..augment_utilities import make_text_from_orth
from ..keyboard import Keyboard


def char_replace_augmenter_v1(
    nlp: Language,
    example: Example,
    level: float,
    replace: dict,
) -> Iterator[Example]:
    """Yield one augmented copy of ``example`` with characters replaced.

    Every character of every token in ``example.reference`` is replaced, with
    probability ``level``, by a random choice among ``replace[character]``.
    Characters that are not keys of ``replace``, or whose candidate sequence
    is empty, are left untouched.

    Args:
        nlp: Pipeline whose ``make_doc`` builds the doc for the augmented text.
        example: The example to augment.
        level: Probability of replacing each individual character.
        replace: Mapping from a character to the sequence of strings it may
            be replaced with.

    Yields:
        A single example created from the augmented text and the updated
        annotation dict.
    """

    def _replace_chars(token: Token) -> str:
        chars = []
        for char in token.text:
            # The random draw comes first and happens for every character, so
            # the number of draws does not depend on the contents of `replace`.
            if random.random() < level and char in replace:
                candidates = replace[char]
                # random.choice raises IndexError on an empty sequence; a
                # character without candidates is simply kept as it is.
                if len(candidates) > 0:
                    char = random.choice(candidates)
            chars.append(char)
        return "".join(chars)

    example_dict = example.to_dict()
    example_dict["token_annotation"]["ORTH"] = [
        _replace_chars(token) for token in example.reference
    ]
    # The text is rebuilt from the modified ORTH values so that the new doc
    # and the token annotations describe the same string.
    text = make_text_from_orth(example_dict)
    doc = nlp.make_doc(text)
    yield Example.from_dict(doc, example_dict)


@spacy.registry.augmenters("char_replace_random_v1")  # type: ignore
def create_char_random_augmenter_v1(
    level: float,
    keyboard: str = "en_qwerty_v1",
) -> Augmenter:
    """Creates an augmenter that replaces a character with a random character
    from the keyboard.

    Args:
        level: The probability to replace a character with a random character
            from the keyboard.
        keyboard: A defined keyboard in the keyboard registry. To see a list of
            all keyboard you can run `augmenty.keyboards()`. Defaults to
            "en_qwerty_v1".

    Returns:
        The augmenter.

    Example:
        >>> import augmenty
        >>> from spacy.lang.en import English
        >>> nlp = English()
        >>> char_random_augmenter = augmenty.load("char_replace_random_v1", level=0.1)
        >>> texts = ["A sample text"]
        >>> list(augmenty.texts(texts, char_random_augmenter, nlp))
        ["A sabple tex3"]
    """

    kb = Keyboard.from_registry(keyboard)
    # Collect the keys once instead of once per key; every key maps to the
    # same candidate list (all keys of the keyboard, itself included). The
    # list is only read by the augmenter, so sharing it is safe.
    all_keys = list(kb.all_keys())
    replace_dict = {key: all_keys for key in all_keys}
    return partial(char_replace_augmenter_v1, replace=replace_dict, level=level)


@spacy.registry.augmenters("char_replace_v1")  # type: ignore
def create_char_replace_augmenter_v1(
    level: float,
    replace: dict,
) -> Augmenter:
    """Creates an augmenter that replaces a character with a random character
    from replace dict.

    Args:
        level: Probability to augment a character, if the document is
            augmented.
        replace: A dictionary mapping each character to the list of potential
            replacements for that character.

    Returns:
        The augmenter function.

    Example:
        >>> create_char_replace_augmenter_v1(level=0.02,
        >>>                                  replace={"æ": ["ae"], "ß": ["ss"]})
    """
    # Bind the configuration; the returned callable only needs (nlp, example).
    return partial(char_replace_augmenter_v1, level=level, replace=replace)


@spacy.registry.augmenters("keystroke_error_v1")  # type: ignore
def create_keystroke_error_augmenter_v1(
    level: float,
    distance: float = 1.5,
    keyboard: str = "en_qwerty_v1",
) -> Augmenter:
    """Creates an augmenter which augments a text with plausible typos based on
    keyboard distance.

    Args:
        level: The probability to replace a character with a neighbouring
            character.
        distance: Keyboard distance. Defaults to 1.5 corresponding to
            neighbouring keys including diagonals.
        keyboard: A defined keyboard in the keyboard registry. To see a list of
            all keyboard you can run `augmenty.keyboards.get_all()`. Defaults
            to "en_qwerty_v1".

    Returns:
        The augmenter.

    Example:
        >>> import augmenty
        >>> from spacy.lang.en import English
        >>> nlp = English()
        >>> keystroke_error_augmenter = augmenty.load("keystroke_error_v1",
        >>>                                           level=0.1,
        >>>                                           keyboard="en_qwerty_v1")
        >>> texts = ["A sample text"]
        >>> list(augmenty.texts(texts, keystroke_error_augmenter, nlp))
        ["A sajple texr"]
    """
    kb = Keyboard.from_registry(keyboard)
    # The replacement candidates for each key come from the keyboard's
    # distance dict for the requested distance.
    replace_dict = kb.create_distance_dict(distance=distance)  # type: ignore
    return partial(char_replace_augmenter_v1, replace=replace_dict, level=level)