# Source code for augmenty.token.replace

import random
from functools import partial
from typing import Callable, Dict, Iterator, List, Optional, Union

import spacy
from spacy.language import Language
from spacy.tokens import Token
from spacy.training import Example

from ..augment_utilities import make_text_from_orth
from .static_embedding_util import static_embedding
from .wordnet_util import init_wordnet


def token_dict_replace_augmenter_v1(
    nlp: Language,
    example: Example,
    level: float,
    replace: Union[Dict[str, List[str]], Dict[str, Dict[str, List[str]]]],  # type: ignore
    ignore_casing: bool,
    getter: Callable[[Token], str],  # type: ignore
    keep_titlecase: bool,
) -> Iterator[Example]:  # type: ignore
    """Yield a copy of ``example`` with tokens swapped according to ``replace``.

    Args:
        nlp: Pipeline used to create the augmented doc via ``nlp.make_doc``.
        example: The example to augment; tokens are read from
            ``example.reference``.
        level: Probability of replacing a token that has an entry in
            ``replace``.
        replace: Either a mapping from a word to a list of replacements, or a
            mapping from a word to a ``{tag: [replacements]}`` mapping where
            the tag is whatever ``getter`` returns for the token.
        ignore_casing: If True, the token text is lowercased before it is
            looked up in ``replace``.
        getter: Extracts the tag used to select among tag-specific
            replacements.
        keep_titlecase: If True, a replacement for a title-cased token is
            capitalized.

    Yields:
        A single augmented ``Example``.
    """

    def __replace(t):
        # Use the same (possibly lowercased) key for the membership test and
        # for every lookup; indexing with ``t.text`` raised KeyError for
        # tokens such as "Act" when only "act" is a key.
        key = t.text.lower() if ignore_casing is True else t.text
        if key not in replace or random.random() >= level:
            return t.text

        candidates = replace[key]
        if isinstance(candidates, dict):
            # Tag-specific replacements: leave the token untouched when there
            # is no entry for this token's tag.
            candidates = candidates.get(getter(t))
        if not candidates:
            return t.text

        text = random.choice(candidates)  # type: ignore
        if keep_titlecase is True and t.is_title is True:
            text = text.capitalize()
        return text

    example_dict = example.to_dict()
    example_dict["token_annotation"]["ORTH"] = [__replace(t) for t in example.reference]
    text = make_text_from_orth(example_dict)
    doc = nlp.make_doc(text)
    yield example.from_dict(doc, example_dict)


@spacy.registry.augmenters("token_dict_replace_v1")  # type: ignore
def create_token__dict_replace_augmenter_v1(
    level: float,
    replace: Union[Dict[str, List[str]], Dict[str, Dict[str, List[str]]]],  # type: ignore
    ignore_casing: bool = True,
    getter: Callable[[Token], str] = lambda token: token.pos_,  # type: ignore
    keep_titlecase: bool = True,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Creates an augmenter that swaps a token with a replacement taken from a
    dictionary.

    Args:
        level: Probability to replace token given that it is in synonym dictionary.
        replace: A dictionary of words and a list of their replacement (e.g. synonyms) or a dictionary denoting replacement based on pos tag.
        ignore_casing: When doing the lookup should the model ignore casing?
        getter: A getter function to extract the POS-tag.
        keep_titlecase: Should the model keep the titlecase of the replaced word.

    Returns:
        The augmenter.

    Examples:
        >>> replace = {"act": ["perform", "move", "action"], }
        >>> create_token__dict_replace_augmenter_v1(replace=replace, level=.10)
        >>> # or
        >>> replace = {"act": {"VERB": ["perform", "move"], "NOUN": ["action", "deed"]}}
        >>> create_token__dict_replace_augmenter_v1(replace=replace, level=.10)
    """
    if ignore_casing is True:
        # Build a new mapping instead of inserting into ``replace`` while
        # iterating over it: that raised "dictionary changed size during
        # iteration" for any non-lowercase key and mutated the caller's dict.
        # If several keys collapse to the same lowercase form, the one
        # appearing last in ``replace`` wins.
        lowered = {k.lower(): v for k, v in replace.items()}
        replace = {**replace, **lowered}  # type: ignore

    return partial(
        token_dict_replace_augmenter_v1,
        level=level,
        replace=replace,
        getter=getter,
        ignore_casing=ignore_casing,
        keep_titlecase=keep_titlecase,
    )


@spacy.registry.augmenters("wordnet_synonym_v1")  # type: ignore
def create_wordnet_synonym_augmenter_v1(
    level: float,
    lang: Optional[str] = None,  # type: ignore
    respect_pos: bool = True,
    getter: Callable = lambda token: token.pos_,  # type: ignore
    keep_titlecase: bool = True,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Creates an augmenter that swaps a token with one of its WordNet
    synonyms.

    Args:
        lang: Language supplied a ISO 639-1 language code. If None, the lang is based on the language of the
            spacy nlp pipeline used. Possible language codes include:
            "da", "ca", "en", "eu", "fa", "fi", "fr", "gl", "he", "id", "it", "ja",
            "nn", "no", "pl", "pt", "es", "th".
        level: Probability of attempting to replace a token.
        respect_pos: Should POS-tag be respected? If True, only tokens whose
            tag (as returned by ``getter``) is a key of ``upos_wn_dict`` are
            considered, and the synset lookup is restricted to that POS.
        getter: A getter function to extract the POS-tag.
        keep_titlecase: Should the model keep the titlecase of the replaced word.

    Returns:
        The augmenter.

    Example:
        >>> english_synonym_augmenter = create_wordnet_synonym_augmenter_v1(level=0.1,
        >>>                                                                 lang="en")
    """
    init_wordnet()
    # Imported only after init_wordnet() has run.
    # NOTE(review): presumably init_wordnet makes the corpus available - confirm.
    from nltk.corpus import wordnet  # type: ignore

    from .wordnet_util import lang_wn_dict, upos_wn_dict

    def wordnet_synonym_augmenter_v1(
        nlp: Language,
        example: Example,
        level: float,
        lang: Optional[str],  # type: ignore
        getter: Callable,  # type: ignore
        respect_pos: bool,
        keep_titlecase: bool,
    ) -> Iterator[Example]:  # type: ignore
        if lang is None:
            # Fall back to the pipeline language, translated through
            # lang_wn_dict like an explicitly supplied code.
            lang = lang_wn_dict[nlp.lang]  # type: ignore

        def __replace(t):
            word = t.text.lower()
            if random.random() < level and (
                respect_pos is False or getter(t) in upos_wn_dict
            ):
                if respect_pos is True:
                    syns = wordnet.synsets(word, pos=upos_wn_dict[getter(t)], lang=lang)
                else:
                    syns = wordnet.synsets(word, lang=lang)
                if syns:
                    rep = {
                        l
                        for syn in syns
                        for l in syn.lemma_names(lang=lang)  # noqa E741 # type: ignore
                    }
                    # Never "replace" a word with itself.
                    rep.discard(word)
                    if rep:
                        # random.sample() no longer accepts a set (TypeError
                        # on Python 3.11+). Sorting also makes the pick
                        # reproducible under a fixed random seed, since set
                        # order depends on string hashing.
                        text = random.choice(sorted(rep))
                        if keep_titlecase is True and t.is_title is True:
                            text = text.capitalize()
                        return text
            return t.text

        example_dict = example.to_dict()
        example_dict["token_annotation"]["ORTH"] = [
            __replace(t) for t in example.reference
        ]
        text = make_text_from_orth(example_dict)
        doc = nlp.make_doc(text)
        yield example.from_dict(doc, example_dict)

    if lang:
        lang = lang_wn_dict[lang]
    return partial(
        wordnet_synonym_augmenter_v1,
        level=level,
        lang=lang,
        getter=getter,
        keep_titlecase=keep_titlecase,
        respect_pos=respect_pos,
    )


def token_replace_augmenter_v1(
    nlp: Language,
    example: Example,
    level: float,
    replace: Callable[[Token], str],  # type: ignore
    keep_titlecase: bool,
) -> Iterator[Example]:  # type: ignore
    """Yield a copy of ``example`` where tokens are rewritten by ``replace``.

    Each token of ``example.reference`` is passed to ``replace`` with
    probability ``level``; otherwise its text is kept as is. When
    ``keep_titlecase`` is True, the replacement of a title-cased token is
    capitalized.

    Args:
        nlp: Pipeline used to create the augmented doc via ``nlp.make_doc``.
        example: The example to augment.
        level: Per-token probability of calling ``replace``.
        replace: Callable mapping a token to its replacement text.
        keep_titlecase: Whether to capitalize replacements of title-cased
            tokens.

    Yields:
        A single augmented ``Example``.
    """

    def _substitute(token) -> str:
        new_text = replace(token)
        if keep_titlecase is True and token.is_title is True:
            return new_text.capitalize()
        return new_text

    annotations = example.to_dict()

    orths = []
    for token in example.reference:
        # The random draw happens first, so ``replace`` is only called for
        # tokens that were selected.
        if random.random() < level:
            orths.append(_substitute(token))
        else:
            orths.append(token.text)
    annotations["token_annotation"]["ORTH"] = orths

    augmented_doc = nlp.make_doc(make_text_from_orth(annotations))
    yield example.from_dict(augmented_doc, annotations)


@spacy.registry.augmenters("token_replace_v1")  # type: ignore
def create_token_replace_augmenter_v1(
    replace: Callable[[Token], str],  # type: ignore
    keep_titlecase: bool = True,
    level: Optional[float] = None,  # type: ignore
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Creates an augmenter which replaces a token based on a replace function.

    Args:
        replace: A callable which takes a spaCy Token as input and returns the replaced word as a string.
        keep_titlecase: If the original token was title-cased, should the replacement text also be?
        level: Probability of replacing each token. If left as None it is not
            bound to the augmenter and has to be passed when the augmenter is
            called (the behaviour before this argument existed).

    Returns:
        The augmenter.

    Examples:
        >>> def remove_vowels(token):
        ...    vowels = ['a','e','i','o','u', 'y']
        ...    non_vowels = [c for c in token.text if c.lower() not in vowels]
        ...    return ''.join(non_vowels)
        >>> aug = create_token_replace_augmenter_v1(replace=remove_vowels, level=.10)
    """
    bound = {"replace": replace, "keep_titlecase": keep_titlecase}
    # token_replace_augmenter_v1 requires ``level``; previously it was never
    # bound, so calling the augmenter as ``augmenter(nlp, example)`` failed
    # with a missing-argument TypeError.
    if level is not None:
        bound["level"] = level
    return partial(token_replace_augmenter_v1, **bound)  # type: ignore


@spacy.registry.augmenters("word_embedding_v1")  # type: ignore
def create_word_embedding_augmenter_v1(
    level: Optional[float] = None,  # type: ignore
    n: int = 10,
    nlp: Optional[Language] = None,  # type: ignore
    keep_titlecase: bool = True,
    ignore_casing: bool = True,
) -> Callable[[Language, Example], Iterator[Example]]:  # type: ignore
    """Creates an augmenter which replaces a token with one of its most
    similar words according to static word embeddings.

    Args:
        level: Probability of replacing each token. Should be supplied; if
            left as None it is not bound to the augmenter and has to be passed
            when the augmenter is called.
        n: Number of most similar word vectors to sample from.
        nlp: A spaCy text-processing pipeline used for supplying the word vectors if the nlp model supplied doesn't contain word vectors.
        keep_titlecase: If the original token was title-cased, should the replacement text also be?
        ignore_casing: The word embedding augmenter does not replace a word with the same word. Should this operation ignore casing?

    Returns:
        The augmenter.

    Examples:
        >>> nlp = spacy.load('en_core_web_lg')
        >>> aug = create_word_embedding_augmenter_v1(nlp=nlp, level=.10)
    """

    def replace(
        t: Token,
        n: int,
        ignore_casing: bool,
        embedding: static_embedding,
    ) -> str:
        # Lazily take the vectors from the token's own vocab when no pipeline
        # was supplied up front.
        if embedding.vocab is None:
            embedding.update_from_vocab(t.doc.vocab)
        if embedding.vocab.vectors.shape == (0, 0):  # type: ignore
            raise ValueError(
                "Vectors are empty. Typically this is due to using a transformer-based "
                + "or small spaCy model. Specify nlp for the "
                + "create_word_embedding_augmenter to a spaCy pipeline with static word"
                + " embedding to avoid this issue.",
            )
        if t.text in embedding:
            # Ask for a couple of extra neighbours because the token itself
            # (or a case variant of it) is filtered out below.
            rep = embedding.most_similar(t.text, n=n + 2)
            if ignore_casing is True:
                rep = [w for w in rep if w.lower() != t.text.lower()][:n]
            else:
                rep = [w for w in rep if w != t.text][:n]
            if rep:
                return random.choice(rep)
        return t.text

    embedding = static_embedding.from_vocab(nlp.vocab) if nlp else static_embedding()

    __replace = partial(replace, n=n, ignore_casing=ignore_casing, embedding=embedding)
    bound = {"replace": __replace, "keep_titlecase": keep_titlecase}
    # The signature used to read ``level=float``, which made the *type*
    # ``float`` the default value; the augmenter then failed on
    # ``random.random() < float``. Only bind a real probability.
    if level is not None:
        bound["level"] = level
    return partial(token_replace_augmenter_v1, **bound)  # type: ignore