# Source code for niacin.text.en.char

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Character-based functions for enriching English language data.

Importable functions include:

* add_characters
* add_contractions
* add_fat_thumbs
* add_leet
* add_macbook_keyboard
* add_whitespace
* remove_characters
* remove_contractions
* remove_punctuation
* remove_whitespace
* swap_chars
"""

import collections
import json
from pkg_resources import resource_string
from string import ascii_letters, punctuation
import typing

from scipy import random


# Character groups and their "leet" replacements. Order matters: the groups
# are tried in this sequence, so a longer group must come before any shorter
# group it contains (e.g. "anned" before "and", "ate" before "at").
LEETMAP = collections.OrderedDict(
    (
        # multi-character groups
        ("anned", "&"),
        ("and", "&"),
        ("what", "wat"),
        ("are", "r"),
        ("ate", "8"),
        ("at", "@"),
        ("one", "1"),
        ("you", "u"),
        # single characters
        ("t", "7"),
        ("o", "0"),
        ("e", "3"),
        ("l", "1"),
    )
)

# Mapping used by ``add_contractions``; loaded from JSON data shipped inside
# the ``niacin`` package.
CONTRACT = json.loads(
    str(resource_string("niacin", "data/contractions.json"), "utf-8")
)
# Inverse of CONTRACT, used by ``remove_contractions``. When several keys of
# CONTRACT share one value, the last such key wins.
EXPAND = dict(zip(CONTRACT.values(), CONTRACT.keys()))
# Mapping from a character to its candidate replacements, used by
# ``add_fat_thumbs``.
NEIGHBORS = json.loads(
    str(resource_string("niacin", "data/neighbors.json"), "utf-8")
)


def _sub_chars(string: str, probability: float, mapping: typing.Mapping) -> str:
    """Replace substrings with a given probability.

    Given a mapping, search string one by one for keys and replace with
    the appropriate value, with some probability. If your keys are not mutually
    exclusive (e.g. some part of them overlaps), the order in which they appear
    in the mapping becomes important.

    Args:
        string: text
        probability: probability of replacing a group of characters
        mapping: map of substring -> replacement

    Returns:
        enriched text
    """
    for pattern, sub in mapping.items():
        index = 0
        while 0 <= index < len(string):
            index = string.lower().find(pattern, index)
            if index < 0:
                break
            elif random.binomial(1, probability):
                string = string[:index] + sub + string[index + len(pattern) :]
                index += len(sub)
            else:
                index += len(pattern)
    return string


def add_fat_thumbs(string: str, p: float = 0.01) -> str:
    """Replace characters with QWERTY neighbors.

    One source of typographic mistakes comes from pressing a nearby key on a
    keyboard (or on a touchscreen). With probability p, replace each character
    in a string with one from a set of its neighbors. The replacement is
    chosen using ``random.choice``.

    Characters that have no entry in ``NEIGHBORS`` are never replaced, and no
    random draw is made for them.

    Args:
        string: text
        p: probability of replacing a character

    Returns:
        enriched text
    """
    # Edit a list in place and join once, rather than rebuilding the whole
    # string for every replaced character.
    # NOTE(review): assumes every NEIGHBORS value is a non-empty sequence of
    # single characters -- confirm against data/neighbors.json.
    chars = list(string)
    for index, char in enumerate(chars):
        if char in NEIGHBORS and random.binomial(1, p):
            chars[index] = str(random.choice(NEIGHBORS[char]))
    return "".join(chars)


def add_characters(string: str, p: float = 0.01) -> str:
    """Insert individual characters with probability p.

    These are chosen randomly from the ascii alphabet (including both upper
    and lower cases). An insertion can happen in front of each existing
    character; nothing is ever appended after the last character.

    Args:
        string: text
        p: probability of inserting a character before each position

    Returns:
        enriched text
    """
    letters = list(ascii_letters)
    # Walk the text back to front (the order in which the random draws are
    # made) and assemble the result reversed, so it can be joined once.
    pieces = []
    for char in reversed(string):
        pieces.append(char)
        if random.binomial(1, p):
            # Appended after ``char`` here, so it lands in front of it once
            # the list is reversed.
            pieces.append(str(random.choice(letters)))
    pieces.reverse()
    return "".join(pieces)


def add_contractions(string: str, p: float = 0.5) -> str:
    """Replace common word pairs with their contraction.

    This is done even when the contraction introduces ambiguity, as this is
    seen as preserving the semantics (arXiv:1812.04718_).

    Args:
        string: text
        p: probability of a word pair being replaced

    Returns:
        enriched text

    .. _arXiv:1812.04718 : https://arxiv.org/abs/1812.04718
    """
    return _sub_chars(string, probability=p, mapping=CONTRACT)


def remove_contractions(string: str, p: float = 0.5) -> str:
    """Expand a contraction into individual tokens.

    See (arXiv:1812.04718_).

    Args:
        string: text
        p: probability of a contraction being expanded

    Returns:
        enriched text

    .. _arXiv:1812.04718 : https://arxiv.org/abs/1812.04718
    """
    return _sub_chars(string, probability=p, mapping=EXPAND)


def add_leet(string: str, p: float = 0.2) -> str:
    """Replace character groups with visually or aurally similar ones.

    Character groups given in ``LEETMAP.keys()`` are searched for in priority
    (roughly from largest to smallest), and are replaced with some associated
    value with probability p. E.g.:

    | "Hello, you are banned"
    | "Hello, you are b&"
    | "Hello, you r b&"
    | "Hello, u r b&"
    | "H3110, u r b&"

    Args:
        string: text
        p: conditional probability of replacing a character group

    Returns:
        enriched text
    """
    return _sub_chars(string, probability=p, mapping=LEETMAP)


def add_macbook_keyboard(string: str, p: float = 0.1) -> str:
    """Repeats or removes each character with probability p.

    Bad keyboards can be a common source of typographical errors by repeating
    characters or by omitting them, e.g. because the individual keys get
    stuck. With probability p, we modify a character, with a 50/50 chance of
    either removing it, or doubling it (so that it appears twice).

    Args:
        string: text
        p: probability of changing letter count

    Returns:
        enriched text
    """
    # Walk the text back to front (the order in which the random draws are
    # made) and assemble the result reversed, so it can be joined once.
    pieces = []
    for char in reversed(string):
        if random.binomial(1, p):
            # 0 drops the character, 2 doubles it.
            count = int(random.choice([0, 2]))
            pieces.append(char * count)
        else:
            pieces.append(char)
    pieces.reverse()
    return "".join(pieces)


def add_whitespace(string: str, p: float = 0.01) -> str:
    """Add a spacebar character with probability p.

    Extraneous whitespace, especially when it occurs in the middle of an
    important word, can reduce the effectiveness of models which depend on
    word tokenizers as part of the data pipeline.

    A space may be inserted at any of the ``len(string) + 1`` positions,
    including before the first and after the last character.

    Args:
        string: text
        p: probability of adding a space character

    Returns:
        enriched text
    """
    space = " "
    # Walk the insertion points back to front (the order in which the random
    # draws are made) and assemble the result reversed, so it is joined once.
    pieces = []
    for index in range(len(string), -1, -1):
        if random.binomial(1, p):
            pieces.append(space)
        if index:
            # The character sitting just before this insertion point.
            pieces.append(string[index - 1])
    pieces.reverse()
    return "".join(pieces)


def remove_characters(string: str, p: float = 0.01) -> str:
    """Remove individual characters with probability p.

    Args:
        string: text
        p: probability of removing a character

    Returns:
        enriched text
    """
    # One draw per character, made back to front; a successful draw drops the
    # character. Survivors are collected reversed and joined once.
    kept = [char for char in reversed(string) if not random.binomial(1, p)]
    kept.reverse()
    return "".join(kept)


def remove_punctuation(string: str, p: float = 0.25) -> str:
    """Remove punctuation with probability p.

    The removal of punctuation is a common data cleaning step for fast but
    high bias models and data processing algorithms. When that punctuation
    occurs in the middle of the word (e.g. indicating possessiveness), its
    removal may change the semantics of the string.

    Punctuation here means the characters in ``string.punctuation``.

    Args:
        string: text
        p: probability of removing punctuation

    Returns:
        enriched text
    """
    # Each punctuation character maps to the empty string, i.e. is deleted.
    mapping = {k: "" for k in punctuation}
    return _sub_chars(string, probability=p, mapping=mapping)


def remove_whitespace(string: str, p: float = 0.1) -> str:
    """Remove a spacebar character with probability p.

    Selective removal of whitespace can reduce the effectiveness of
    word-based models, or those which depend on word tokenizers as part of
    the data pipeline.

    Only the plain space character (``" "``) is considered.

    Args:
        string: text
        p: probability of removing a space character

    Returns:
        enriched text
    """
    mapping = {" ": ""}
    return _sub_chars(string, probability=p, mapping=mapping)


def swap_chars(string: str, p: float = 0.05) -> str:
    """Swap adjacent characters.

    With probability p, swap two adjacent characters in a string. No character
    gets swapped more than once, so cannot end up in any locations that are
    not adjacent to its starting position.

    .. note:: to keep the interface consistent, niacin's implementation acts
        on a probability p, applied at most n-1 times, where n is the total
        number of characters in the string. The implementation in noisemix_
        (called ``flip_chars``) chooses two letters at random and exchanges
        their positions, exactly once per string.

    Args:
        string: text
        p: probability of swapping two characters

    Returns:
        enriched text

    .. _noisemix : https://github.com/noisemix/noisemix
    """
    chars = list(string)
    last = len(chars) - 1
    index = 0
    while index < last:
        if random.binomial(1, p):
            chars[index], chars[index + 1] = chars[index + 1], chars[index]
            # Step over both swapped characters so neither moves again.
            index += 2
        else:
            index += 1
    return "".join(chars)