# Source code for niacin.text.en.word

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Word-based functions for enriching English language data.

Importable functions include:

* add_hypernyms
* add_hyponyms
* add_misspelling
* add_parens
* add_synonyms
* remove_articles
* swap_words
"""

import collections
import json
from pkg_resources import resource_string
import typing

from scipy import random
from nltk import wordnet


# Lookup tables bundled as JSON inside the ``niacin`` package and decoded once
# at import time. As used by the functions in this module, each table is
# indexed by a word (or lemma) and yields a collection of candidate
# replacements that one entry is drawn from.
HYPERNYMS = json.loads(resource_string("niacin", "data/hypernyms.json").decode("utf-8"))
HYPONYMS = json.loads(resource_string("niacin", "data/hyponyms.json").decode("utf-8"))
MISSPELLINGS = json.loads(
    resource_string("niacin", "data/misspellings.json").decode("utf-8")
)
SYNONYMS = json.loads(resource_string("niacin", "data/synonyms.json").decode("utf-8"))


# Lower-case tokens that ``remove_articles`` strips from text.
# NOTE(review): besides the articles proper, this also lists demonstratives
# and possessive forms -- confirm that this is intentional.
ARTICLES = ("the", "a", "an", "these", "those", "his", "hers", "their")


def _get_wordnet():
    """Return a WordNet lemmatizer, downloading the corpus if it is missing.

    A throwaway ``lemmatize`` call is made right after construction so that
    a missing corpus is detected here rather than in the caller. When the
    lookup fails, the ``wordnet`` corpus is downloaded through
    ``nltk.download`` and a fresh lemmatizer is returned.

    Returns:
        a ``WordNetLemmatizer`` instance
    """
    try:
        wn = wordnet.WordNetLemmatizer()
        # Probe call: presumably forces the corpus to be loaded so that
        # missing data raises now.
        wn.lemmatize("this is a test")
    except LookupError:
        # NLTK signals missing data with LookupError; catching only that
        # keeps KeyboardInterrupt, SystemExit and unrelated failures from
        # being silently turned into a download attempt.
        print("Missing wordnet data -- attempting to download")
        import nltk

        nltk.download("wordnet")
        wn = wordnet.WordNetLemmatizer()
    return wn


def _sub_words(string: str, probability: float, mapping: typing.Mapping) -> str:
    """Replace words with a given probability.

    Split a string into words (the naïve way, on whitespace). Then, search
    word list for each key in the mapping, and replace it with its value with
    some probability. Then join them back together with a single whitespace.

    Args:
        string: text
        probability: probability of replacing a word
        mapping: map of substring -> replacement

    Returns:
        enriched text
    """
    words = string.split()
    for pattern, sub in mapping.items():
        for index, word in enumerate(words):
            if (word.lower() == pattern) and random.binomial(1, probability):
                words[index] = sub
    return " ".join(word for word in words if word)


def add_hypernyms(string: str, p: float = 0.01) -> str:
    """Replace word with a higher-level category.

    A common negative sampling technique involves replacing words
    in a sentence with a word that has the same general meaning, but
    is too general for the context, e.g.:

    "all dogs go to heaven" -> "all quadrupeds go to place"

    The replacement words are drawn from wordnet (wordnet_). For
    words with more than one possible replacement, one is selected using
    ``random.choice``.

    The text is split on whitespace and each word is lemmatized before it
    is looked up. The output is joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Args:
        string: text
        p: conditional probability of replacing a word

    Returns:
        enriched text

    .. _wordnet: https://wordnet.princeton.edu/
    """
    wn = _get_wordnet()
    words = string.split()
    for index, word in enumerate(words):
        candidates = HYPERNYMS.get(wn.lemmatize(word))
        # Skip lemmas without an entry, and entries with no candidates
        # (``random.choice`` rejects an empty sequence).
        if candidates and random.binomial(1, p):
            words[index] = random.choice(candidates)
    return " ".join(words)


def add_hyponyms(string: str, p: float = 0.01) -> str:
    """Replace word with a lower-level category.

    A common negative sampling technique involves replacing words
    in a sentence with a word that has the same general meaning, but
    is too specific for the context, e.g.:

    "all dogs go to heaven" -> "all Australian shepherds go to heaven"

    The replacement words are drawn from wordnet (wordnet_). For
    words with more than one possible replacement, one is selected using
    ``random.choice``.

    The text is split on whitespace and each word is lemmatized before it
    is looked up. The output is joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Args:
        string: text
        p: conditional probability of replacing a word

    Returns:
        enriched text

    .. _wordnet: https://wordnet.princeton.edu/
    """
    wn = _get_wordnet()
    words = string.split()
    for index, word in enumerate(words):
        candidates = HYPONYMS.get(wn.lemmatize(word))
        # Skip lemmas without an entry, and entries with no candidates
        # (``random.choice`` rejects an empty sequence).
        if candidates and random.binomial(1, p):
            words[index] = random.choice(candidates)
    return " ".join(words)


def add_misspelling(string: str, p: float = 0.1) -> str:
    """Replace words with common misspellings.

    Replaces a word with a common way that word is misspelled, given one or
    more known, common misspellings taken from the Wikipedia spelling
    correction corpus (wikipedia_). For words with more than one common
    misspelling, one is chosen using ``random.choice``.

    The text is split on whitespace and each word is looked up exactly as
    written (no lemmatization or case folding). The output is joined with
    single spaces, so runs of whitespace in the input are collapsed.

    Args:
        string: text
        p: conditional probability of replacing a word

    Returns:
        enriched text

    .. _wikipedia: https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings
    """
    words = string.split()
    for index, word in enumerate(words):
        candidates = MISSPELLINGS.get(word)
        # Skip words without an entry, and entries with no candidates
        # (``random.choice`` rejects an empty sequence).
        if candidates and random.binomial(1, p):
            words[index] = random.choice(candidates)
    return " ".join(words)


def add_parens(string: str, p: float = 0.01) -> str:
    """Wrap individual words in triple parentheses.

    Adds parentheses before and after a word, e.g. ``(((term)))``.
    This is a common tactic for disrupting tokenizers and other kinds
    of word based models.

    The text is split on whitespace and an independent draw is made for
    every word. The output is joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Args:
        string: text
        p: probability of wrapping a word

    Returns:
        enriched text
    """
    return " ".join(
        "(((" + word + ")))" if random.binomial(1, p) else word
        for word in string.split()
    )


def add_synonyms(string: str, p: float = 0.01) -> str:
    """Replace word with one that has a close meaning.

    A common data augmentation technique involves replacing words
    in a sentence with a word that has the same general meaning
    (arxiv:1509.01626_), e.g.:

    "all dogs go to heaven" -> "all domestic dog depart to heaven"

    The replacement words are drawn from wordnet (wordnet_). For
    words with more than one possible replacement, one is selected using
    ``random.choice``.

    The text is split on whitespace and each word is lemmatized before it
    is looked up. The output is joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Args:
        string: text
        p: conditional probability of replacing a word

    Returns:
        enriched text

    .. _arxiv:1509.01626 : https://arxiv.org/abs/1509.01626
    .. _wordnet: https://wordnet.princeton.edu/
    """
    wn = _get_wordnet()
    words = string.split()
    for index, word in enumerate(words):
        candidates = SYNONYMS.get(wn.lemmatize(word))
        # Skip lemmas without an entry, and entries with no candidates
        # (``random.choice`` rejects an empty sequence).
        if candidates and random.binomial(1, p):
            words[index] = random.choice(candidates)
    return " ".join(words)


def remove_articles(string: str, p: float = 1.0) -> str:
    """Remove articles from text data.

    Matches and removes the following articles:

    * the
    * a
    * an
    * these
    * those
    * his
    * hers
    * their

    with probability p.

    Matching is done on whole whitespace-delimited words, ignoring case, so
    a word with punctuation attached (e.g. ``"the,"``) is left untouched.
    The output is joined with single spaces.

    Args:
        string: text
        p: probability of removing a given article

    Returns:
        enriched text
    """
    # Mapping every article to the empty string makes ``_sub_words`` drop it.
    mapping = dict.fromkeys(ARTICLES, "")
    return _sub_words(string, probability=p, mapping=mapping)


def swap_words(string: str, p: float = 0.01) -> str:
    """Swap adjacent words.

    With probability p, swap two adjacent words in a string. This preserves
    the vocabulary of input text while changing token order, and in
    theory should provide more of a challenge to recursive models than ones
    that rely on lexical distributions.

    .. note::
        to keep the interface consistent, niacin's implementation acts on
        a probability p, applied to at most n-1 adjacent pairs, where n is
        the total number of words in the string. After a swap the scan
        resumes past the swapped pair, so a word is moved at most once. In
        the original paper (eda_), two words are chosen n times and swapped,
        where n is a count number given as a hyperparameter.

    The text is split on whitespace and the output is joined with single
    spaces, so runs of whitespace in the input are collapsed.

    Args:
        string: text
        p: probability of swapping two words

    Returns:
        enriched text

    .. _eda : https://arxiv.org/abs/1901.11196
    """
    words = string.split()
    last = len(words) - 1
    index = 0
    while index < last:
        if random.binomial(1, p):
            words[index], words[index + 1] = words[index + 1], words[index]
            # Jump over both swapped words so neither is swapped again.
            index += 2
        else:
            index += 1
    return " ".join(words)
# Source code for niacin.text.en.word

# niacin

# Navigation

# Related Topics