Initial commit

2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions
--- a/venv/lib/python3.7/site-packages/nltk/classify/textcat.py
+++ b/venv/lib/python3.7/site-packages/nltk/classify/textcat.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Language ID module using TextCat algorithm
+#
+# Copyright (C) 2001-2019 NLTK Project
+# Author: Avital Pekker <avital.pekker@utoronto.ca>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+A module for language identification using the TextCat algorithm.
+An implementation of the text categorization algorithm
+presented in Cavnar, W. B. and J. M. Trenkle,
+"N-Gram-Based Text Categorization".
+
+The algorithm takes advantage of Zipf's law and uses n-gram
+frequencies to profile both the known languages and the text yet to
+be identified, then compares the profiles using a distance measure.
+
+Language n-grams are provided by the "An Crubadan"
+project. A corpus reader was created separately to read
+those files.
+
+For details regarding the algorithm, see:
+http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
+
+For details about An Crubadan, see:
+http://borel.slu.edu/crubadan/index.html
+"""
+
+# Ensure that literal strings default to unicode rather than str.
+from __future__ import print_function, unicode_literals
+
+from nltk.compat import PY3
+from nltk.util import trigrams
+
+if PY3:
+    from sys import maxsize
+else:
+    from sys import maxint
+
+# Note: this is NOT "re" you're likely used to. The regex module
+# is an alternative to the standard re module that supports
+# Unicode codepoint properties with the \p{} syntax.
+# You may have to "pip install regex"
+try:
+    import regex as re
+except ImportError:
+    re = None
+######################################################################
+##  Language identification using TextCat
+######################################################################
+
+
+class TextCat(object):
+    """Language identifier based on the TextCat n-gram algorithm.
+
+    A text is profiled as a frequency distribution of character
+    trigrams.  That profile is compared, trigram by trigram, with the
+    per-language trigram profiles returned by the ``crubadan`` corpus
+    reader using the "out-of-place" measure, and the language with the
+    smallest summed distance is reported as the guess.
+    """
+
+    # Corpus reader supplying the language profiles; bound in __init__.
+    _corpus = None
+    # Not referenced by any method of this class; kept so that external
+    # code reading the attribute keeps working.
+    fingerprints = {}
+    # Markers wrapped around every token before trigrams are extracted.
+    _START_CHAR = "<"
+    _END_CHAR = ">"
+
+    # Class-level default only: __init__ gives each instance its own
+    # dict and guess_language() rebinds it on every call.
+    last_distances = {}
+
+    def __init__(self):
+        """Check the ``regex`` dependency and load the language profiles.
+
+        :raises EnvironmentError: if the third-party ``regex`` module
+            could not be imported when this module was loaded.
+        """
+        if not re:
+            raise EnvironmentError(
+                "classify.textcat requires the regex module that "
+                "supports unicode. Try '$ pip install regex' and "
+                "see https://pypi.python.org/pypi/regex for "
+                "further details."
+            )
+
+        from nltk.corpus import crubadan
+
+        self._corpus = crubadan
+        # Per-instance store, so instances never share the class-level dict.
+        self.last_distances = {}
+        # Load all language ngrams into cache.  lang_dists() later iterates
+        # the reader's ``_all_lang_freq``, which these calls are expected
+        # to populate.
+        for lang in self._corpus.langs():
+            self._corpus.lang_freq(lang)
+
+    def remove_punctuation(self, text):
+        """Return ``text`` with punctuation removed, keeping apostrophes.
+
+        :param text: the string to clean.
+        :return: ``text`` without any character in the Unicode
+            punctuation category ``\\p{P}`` other than ``'``.
+        """
+        return re.sub(r"[^\P{P}\']+", "", text)
+
+    def profile(self, text):
+        """Create a FreqDist of the character trigrams within ``text``.
+
+        Punctuation is stripped, the text is word-tokenized, and every
+        token is wrapped in the start/end markers before its trigrams
+        are counted.
+
+        :param text: the raw text to profile.
+        :return: a ``FreqDist`` mapping trigram strings to counts.
+        """
+        from nltk import word_tokenize, FreqDist
+
+        clean_text = self.remove_punctuation(text)
+
+        fingerprint = FreqDist()
+        for token in word_tokenize(clean_text):
+            padded = self._START_CHAR + token + self._END_CHAR
+            # update() increments the count of every trigram it is fed.
+            fingerprint.update(''.join(tri) for tri in trigrams(padded))
+
+        return fingerprint
+
+    @staticmethod
+    def _missing_penalty():
+        """Return the distance charged for a trigram a language lacks."""
+        # Arbitrary but should be larger than any possible trigram file
+        # length in terms of total lines.
+        return maxsize if PY3 else maxint
+
+    def calc_dist(self, lang, trigram, text_profile):
+        """Calculate the "out-of-place" measure for a single trigram.
+
+        :param lang: language code understood by the corpus reader.
+        :param trigram: the trigram string to score.
+        :param text_profile: profile of the text, as built by profile();
+            it must contain ``trigram``.
+        :return: the absolute difference between the trigram's position
+            in the language profile's keys and in the text profile's
+            keys, or a very large penalty if the language profile does
+            not contain the trigram.
+        :raises ValueError: if ``trigram`` is in the language profile
+            but not in ``text_profile``.
+        """
+        lang_fd = self._corpus.lang_freq(lang)
+
+        if trigram not in lang_fd:
+            return self._missing_penalty()
+
+        # NOTE(review): these are positions in key order.  Cavnar &
+        # Trenkle compare frequency ranks; whether key order equals rank
+        # order here depends on how the profiles were built -- confirm.
+        idx_lang_profile = list(lang_fd.keys()).index(trigram)
+        idx_text = list(text_profile.keys()).index(trigram)
+
+        return abs(idx_lang_profile - idx_text)
+
+    def lang_dists(self, text):
+        """Calculate the "out-of-place" measure against all languages.
+
+        The per-trigram distance is the one calc_dist() defines, but the
+        key positions are computed once per profile instead of once per
+        trigram, so this method does not call calc_dist() itself.
+
+        :param text: the raw text to compare.
+        :return: a dict mapping each language code to the summed
+            distance over every trigram of the text's profile.
+        """
+        distances = {}
+        # Key order of the text profile; a trigram's index in this list
+        # is its position in the text profile.
+        text_trigrams = list(self.profile(text).keys())
+
+        # For all the languages
+        for lang in self._corpus._all_lang_freq.keys():
+            lang_fd = self._corpus.lang_freq(lang)
+            # Built on first use so a language sharing no trigram with
+            # the text costs nothing extra.
+            lang_positions = None
+
+            lang_dist = 0
+            for idx_text, trigram in enumerate(text_trigrams):
+                if trigram in lang_fd:
+                    if lang_positions is None:
+                        lang_positions = {
+                            key: pos for pos, key in enumerate(lang_fd.keys())
+                        }
+                    lang_dist += abs(lang_positions[trigram] - idx_text)
+                else:
+                    lang_dist += self._missing_penalty()
+
+            distances[lang] = lang_dist
+
+        return distances
+
+    def guess_language(self, text):
+        """Find the language with the minimum distance to ``text``.
+
+        The full distance table is kept in ``self.last_distances``.
+
+        :param text: the raw text to identify.
+        :return: the ISO 639-3 code of the closest language.
+        """
+        self.last_distances = self.lang_dists(text)
+
+        return min(self.last_distances, key=self.last_distances.get)
+
+
+def demo():
+    """Guess the language of several UDHR texts and print the results.
+
+    For each listed UDHR file a sample is assembled from its sentences,
+    the first 140 characters are printed, and the TextCat guess is
+    printed together with a readable language name.
+    """
+    from nltk.corpus import udhr
+
+    langs = [
+        'Kurdish-UTF8',
+        'Abkhaz-UTF8',
+        'Farsi_Persian-UTF8',
+        'Hindi-UTF8',
+        'Hawaiian-UTF8',
+        'Russian-UTF8',
+        'Vietnamese-UTF8',
+        'Serbian_Srpski-UTF8',
+        'Esperanto-UTF8',
+    ]
+
+    # Readable names for the codes the guesses are expected to produce.
+    friendly = {
+        'kmr': 'Northern Kurdish',
+        'abk': 'Abkhazian',
+        'pes': 'Iranian Persian',
+        'hin': 'Hindi',
+        'haw': 'Hawaiian',
+        'rus': 'Russian',
+        'vie': 'Vietnamese',
+        'srp': 'Serbian',
+        'epo': 'Esperanto',
+    }
+
+    tc = TextCat()
+
+    for cur_lang in langs:
+        # Get raw data from UDHR corpus
+        raw_sentences = udhr.sents(cur_lang)
+
+        # Generate a sample text of the language: every word preceded by
+        # a single space.
+        # NOTE(review): the final sentence is left out, as in the
+        # original loop bounds -- confirm this is intentional.
+        words = []
+        for i in range(len(raw_sentences) - 1):
+            words.extend(raw_sentences[i])
+        sample = ''.join(' ' + word for word in words)
+
+        # Try to detect what it is
+        print('Language snippet: ' + sample[0:140] + '...')
+        guess = tc.guess_language(sample)
+        # A guess outside the table above must not crash the demo.
+        name = friendly.get(guess, 'name not listed')
+        print('Language detection: %s (%s)' % (guess, name))
+        print('#' * 140)
+
+
+# Run the demonstration only when this file is executed as a script.
+if __name__ == '__main__':
+    demo()