Source code for lightwood.helpers.text

"""
*******************************************************
 * Copyright (C) 2017 MindsDB Inc. <copyright@mindsdb.com>
 *
 * This file is part of MindsDB Server.
 *
 * MindsDB Server can not be copied and/or distributed without the express
 * permission of MindsDB Inc
 *******************************************************
"""
from collections import Counter, defaultdict
import string
import json
import re
import hashlib
from typing import Iterable
import numpy as np
import scipy.stats as st
import langdetect
import nltk
from lightwood.api.dtype import dtype


try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    from nltk.corpus import stopwords
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)


def get_language_dist(data):
    langdetect.DetectorFactory.seed = 0
    lang_dist = defaultdict(lambda: 0)
    lang_dist['Unknown'] = 0
    lang_probs_cache = dict()
    for text in data:
        text = str(text)
        text = ''.join([c for c in text if c not in string.punctuation])
        if text not in lang_probs_cache:
            try:
                lang_probs = langdetect.detect_langs(text)
            except langdetect.lang_detect_exception.LangDetectException:
                lang_probs = []
            lang_probs_cache[text] = lang_probs

        lang_probs = lang_probs_cache[text]
        if len(lang_probs) > 0 and lang_probs[0].prob > 0.90:
            lang_dist[lang_probs[0].lang] += 1
        else:
            lang_dist['Unknown'] += 1

    return dict(lang_dist)
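
# Usage sketch (illustrative, not from the source; langdetect results vary
# with input length, so exact counts are indicative only). Punctuation-only
# strings strip to '' and raise inside detect_langs, landing in 'Unknown':
#
# >>> get_language_dist(['I am writing a long enough English sentence.', '???'])
# {'Unknown': 1, 'en': 1}
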


def analyze_sentences(data):
    """
    :param data: list of str

    :returns: tuple(
        int: nr words total,
        dict: word_dist,
        dict: nr_words_dist
    )
    """
    nr_words = 0
    word_dist = defaultdict(int)
    nr_words_dist = defaultdict(int)
    stop_words = set(stopwords.words('english'))
    for text in map(str, data):
        text = text.lower()
        tokens = tokenize_text(text)
        tokens_no_stop = [x for x in tokens if x not in stop_words]
        nr_words_dist[len(tokens)] += 1
        nr_words += len(tokens)
        for tok in tokens_no_stop:
            word_dist[tok] += 1

    return nr_words, dict(word_dist), dict(nr_words_dist)
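
# Usage sketch (illustrative; assumes NLTK's punkt tokenizer and english
# stopword list are available, as ensured at import time above):
#
# >>> nr_words, word_dist, nr_words_dist = analyze_sentences(['The cat sat on the mat'])
# >>> nr_words
# 6
# >>> word_dist            # stopwords ('the', 'on') are excluded
# {'cat': 1, 'sat': 1, 'mat': 1}
# >>> nr_words_dist        # one text consisting of 6 tokens
# {6: 1}
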
# NOTE: WORD_SEPARATORS is used below but is not defined or imported in this
# excerpt; a plausible (assumed) definition is:
WORD_SEPARATORS = [',', '\t', ' ']


def word_tokenize(string):
    sep_tag = '{#SEP#}'
    for separator in WORD_SEPARATORS:
        string = str(string).replace(separator, sep_tag)

    words_split = string.split(sep_tag)
    num_words = len([word for word in words_split if word])
    return num_words


def clean_float(val):
    if isinstance(val, (int, float)):
        return float(val)

    val = str(val).strip(' ')
    val = val.replace(',', '.')
    val = val.rstrip('"').lstrip('"')

    if val in ('', '.', 'None', 'nan'):
        return None

    return float(val)
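
# Usage sketch (illustrative): clean_float normalizes comma decimals, quotes
# and sentinel strings before casting, and raises ValueError for anything else:
#
# >>> clean_float('3,14')
# 3.14
# >>> clean_float(' nan ') is None
# True
# >>> clean_float(7)
# 7.0
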
def gen_chars(length, character):
    """
    Generates a string of `length` repetitions of `character`

    :param length: how many times to repeat the character
    :param character: the character to repeat
    :return: the generated string
    """
    return character * length


def cast_string_to_python_type(string):
    """ Returns None, an integer, a float or a string from a string """
    if string is None or string == '':
        return None

    if string.isnumeric():
        # Did you know you can write fractions in unicode, and they are numeric but can't be cast to integers !?
        try:
            return int(string)
        except Exception:
            return None

    try:
        return clean_float(string)
    except Exception:
        return string
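
# Usage sketch (illustrative): the int() call above is wrapped in try/except
# because some unicode characters are numeric yet cannot be cast to int:
#
# >>> '½'.isnumeric()
# True
# >>> cast_string_to_python_type('½') is None
# True
# >>> cast_string_to_python_type('42')
# 42
# >>> cast_string_to_python_type('3,14')   # falls through to clean_float
# 3.14
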
def splitRecursive(word, tokens):
    words = [str(word)]
    for token in tokens:
        new_split = []
        for word in words:
            new_split += word.split(token)
        words = new_split
    words = [word for word in words if word not in ['', None]]
    return words


def hashtext(cell):
    text = json.dumps(cell)
    return hashlib.md5(text.encode('utf8')).hexdigest()


def _is_foreign_key_name(name):
    for endings in ['id', 'ID', 'Id']:
        for add in ['-', '_', ' ']:
            if name.endswith(add + endings):
                return True
    for endings in ['ID', 'Id']:
        if name.endswith(endings):
            return True
    return False


def _is_identifier_name(name):
    for keyword in ['account', 'uuid', 'identifier', 'user']:
        if keyword in name:
            return True
    return False
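
# Usage sketch (illustrative): splitRecursive applies each token as a
# separator in turn, and hashtext yields a stable digest for any
# JSON-serializable cell:
#
# >>> splitRecursive('a-b_c', ['-', '_'])
# ['a', 'b', 'c']
# >>> hashtext(['a', 1]) == hashtext(['a', 1])
# True
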
def isascii(string):
    """
    Used instead of str.isascii because python 3.6 doesn't have that
    """
    return all(ord(c) < 128 for c in string)


def extract_digits(point):
    return ''.join([char for char in str(point) if char.isdigit()])


def get_pct_auto_increment(data):
    int_data = []
    for point in [extract_digits(x) for x in data]:
        try:
            int_data.append(int(point))
        except Exception:
            pass

    if len(int_data) < 2:
        # Guard added: the original would raise an IndexError on empty input
        return 0

    int_data = sorted(int_data)

    prev_nr = int_data[0]
    increase_by_one = 0
    for nr in int_data[1:]:
        diff = nr - prev_nr
        if diff == 1:
            increase_by_one += 1
        prev_nr = nr

    return increase_by_one / (len(data) - 1)


def get_identifier_description_mp(arg_tup):
    data, column_name, data_dtype = arg_tup
    return get_identifier_description(data, column_name, data_dtype)


def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype):
    data = list(data)
    if isinstance(data[0], list):
        nr_unique = len(set(tuple(x) for x in data))
    else:
        nr_unique = len(set(data))

    if nr_unique == 1:
        return 'No Information'

    unique_pct = nr_unique / len(data)

    spaces = [len(str(x).split(' ')) - 1 for x in data]
    mean_spaces = np.mean(spaces)

    # Detect auto incrementing index
    # -- some cases where I guess people do want to use this for learning, so ignoring this check for now...
    # if data_dtype == dtype.integer:
    #     if get_pct_auto_increment(data) > 0.98 and unique_pct > 0.99:
    #         return 'Auto-incrementing identifier'

    # Detect hash
    all_same_length = all(len(str(data[0])) == len(str(x)) for x in data)
    uuid_charset = set('0123456789abcdefABCDEF-')
    all_uuid_charset = all(set(str(x)).issubset(uuid_charset) for x in data)
    is_uuid = all_uuid_charset and all_same_length

    if all_same_length and len(data) == nr_unique and data_dtype not in (dtype.integer, dtype.float):
        str_data = [str(x) for x in data]
        randomness_per_index = []
        for i, _ in enumerate(str_data[0]):
            N = len(set(x[i] for x in str_data))
            S = st.entropy([*Counter(x[i] for x in str_data).values()])
            # Guard added: np.log(1) == 0, so a character position that is
            # constant across all rows would otherwise divide by zero
            randomness_per_index.append(S / np.log(N) if N > 1 else 0)

        if np.mean(randomness_per_index) > 0.95:
            return 'Hash-like identifier'

    # Detect foreign key
    if data_dtype == dtype.integer:
        if _is_foreign_key_name(column_name):
            return 'Foreign key'

    if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary):
        if unique_pct > 0.98:
            if is_uuid:
                return 'UUID'
            else:
                return 'Unknown identifier'

    # Everything is unique and it's too short to be rich text
    if data_dtype in (dtype.categorical, dtype.binary, dtype.short_text, dtype.rich_text) and \
            unique_pct > 0.99999 and mean_spaces < 1:
        return 'Unknown identifier'

    return None


def contains_alnum(text):
    for c in text:
        if c.isalnum():
            return True
    return False


def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def tokenize_text(text):
    return [t.lower() for t in nltk.word_tokenize(decontracted(text)) if contains_alnum(t)]
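
# Usage sketch (illustrative; column values are made up): a fully unique
# column of fixed-length, UUID-charset strings with an identifier-like
# column name is flagged as 'UUID':
#
# >>> import uuid
# >>> data = [str(uuid.uuid4()) for _ in range(100)]
# >>> get_identifier_description(data, 'user_uuid', dtype.categorical)
# 'UUID'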