Source code for niacin.text.en.sentence

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Sentence-based functions for enriching English language data.

Importable functions include:

* add_applause
* add_backtranslation
* add_bytes
* add_love
"""

import regex
from numpy import random
import warnings


P_SPACE = regex.compile(r"(\s+)")


class _Translator:
    """Wrapper around fairseq language models (arXiv:1904.01038_).

    On first initialization, the instance loads language models and stores
    them as attributes on the class. New instances after this do not reload
    them. Currently implements translation from English to German, and the
    reverse.

    Attributes
    ----------
    en2de: callable
        translate from English to German
    de2en: callable
        translate from German to English
    translators: dict
        mapping of model names to model objects

    .. _arXiv:1904.01038 : https://arxiv.org/abs/1904.01038
    """

    translators: dict = {}

    def __init__(self):
        self.load_models()
        self.en2de = self.translators["en2de"].translate
        self.de2en = self.translators["de2en"].translate

    @classmethod
    def load_models(cls, force: bool = False):
        warnings.warn(
            "Backtranslation uses large translation models (~6GB) and can "
            "hours to download on the first use."
        )
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch not found - you may need to install extras with"
                "'pip install niacin[all]'"
            )
        if force or "en2de" not in cls.translators:
            cls.translators["en2de"] = torch.hub.load(
                "pytorch/fairseq",
                "transformer.wmt19.en-de.single_model",
                tokenizer="moses",
                bpe="fastbpe",
            )
        if force or "de2en" not in cls.translators:
            cls.translators["de2en"] = torch.hub.load(
                "pytorch/fairseq",
                "transformer.wmt19.de-en.single_model",
                tokenizer="moses",
                bpe="fastbpe",
            )
        # turn off dropout
        for model in cls.translators.values():
            model.eval()

    def backtranslate(self, string: str) -> str:
        return self.de2en(self.en2de(string))
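
# Usage sketch for the private _Translator wrapper (assumes torch and network
# access for the fairseq hub models; not executed here). Because the loaded
# models live in the class-level ``translators`` dict, a second instantiation
# reuses them instead of downloading again:
#
#     t_first = _Translator()                  # loads en2de and de2en via torch.hub
#     t_second = _Translator()                 # reuses _Translator.translators
#     t_first.backtranslate("good morning")    # English -> German -> English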


def add_applause(string: str, p: float = 0.1) -> str:
    """Replace whitespace with clapping emojis.

    In online communities, replacing whitespace delimiters with the clapping
    emoji (\U0001f44f) is a way of indicating emphasis, possibly as a
    typographic replacement for the baton gesture. This has the unintended
    consequence of rendering word or token-based models ineffective.

    Args:
        string: text
        p: probability of replacing every whitespace character

    Returns:
        enriched text
    """
    if random.binomial(1, p):
        string = P_SPACE.sub("\U0001f44f", string)
    return string
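
# Illustrative sketch, not from the library's test suite: with ``p=1.0`` the
# replacement always fires, and every run of whitespace becomes a single
# clapping emoji.
#
#     add_applause("hello world", p=1.0)
#     # -> 'hello👏world'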


def add_bytes(string: str, p: float = 0.1, length: int = 100) -> str:
    """Add random bytes to the end of a sentence.

    A common spam-disguising technique is appending random sequences of
    bytes to the end of text data. This can be effective against
    character-based models, or loglinear models which include total length
    and character distribution as features. Random bytes are decoded as
    utf-8 with errors ignored, so the total number of characters will
    typically be smaller than the length input parameter.

    Args:
        string: text
        p: probability of adding random bytes
        length: number of random bytes

    Returns:
        enriched text
    """
    if random.binomial(1, p):
        string = string + random.bytes(length).decode("utf-8", errors="ignore")
    return string
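
# Illustrative sketch (the seed is an assumption for reproducibility; the
# appended suffix depends on the RNG state, so only structural properties
# are shown):
#
#     from numpy import random
#     random.seed(0)
#     out = add_bytes("spam", p=1.0, length=10)
#     out.startswith("spam")            # -> True
#     len(out) <= len("spam") + 10      # -> True, invalid bytes are dropped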


def add_love(string: str, p: float = 0.1) -> str:
    """Add love to the end of a sentence.

    Appends ``' love'`` to the end of a string. Including a word with large
    positive sentiment can be used to confuse sentiment-based filters for
    input data (arXiv:1808.09115_).

    Args:
        string: text
        p: probability of adding ' love' to a sentence

    Returns:
        enriched text

    .. _arXiv:1808.09115 : https://arxiv.org/abs/1808.09115
    """
    if random.binomial(1, p):
        string = string + " love"
    return string
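
# Illustrative sketch: with ``p=1.0`` the suffix is always appended.
#
#     add_love("I hate this product.", p=1.0)
#     # -> 'I hate this product. love'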


def add_backtranslation(string: str, p: float = 0.5) -> str:
    """Translate a sentence into another language and back.

    Use a fairseq model to translate a sentence from English into German,
    then translate the German back into English with another fairseq model
    (arXiv:1904.01038_). Anecdotally, this generates sequences with similar
    semantic content, but different word choices, and is a popular way to
    augment small datasets in high-resource languages (arXiv:1904.12848_).

    .. warning::

        Backtranslation uses large neural machine translation (NMT) models.
        The first time you call this function, it will download and cache up
        to 6GB of data, which can take hours depending on your connection
        speed. The slowness only happens once, but the model size will
        impact memory usage every time you use this function.

    Args:
        string: text
        p: probability of backtranslating a sentence

    Returns:
        enriched text

    .. _arXiv:1904.01038 : https://arxiv.org/abs/1904.01038
    .. _arXiv:1904.12848 : https://arxiv.org/abs/1904.12848
    """
    # the fairseq models do weird stuff with empty strings
    if not string:
        return string
    if random.binomial(1, p):
        t = _Translator()
        string = t.backtranslate(string)
    return string
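
# Call-pattern sketch only (not executed here): the first call downloads the
# fairseq translation models via torch.hub, so the paraphrase shown is a
# hypothetical example of the kind of output to expect, not a recorded result.
#
#     augmented = add_backtranslation("The weather is nice today.", p=1.0)
#     # e.g. "The weather is beautiful today."  (similar meaning, new wording)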