Initial commit

2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions
--- a/venv/lib/python3.7/site-packages/nltk/tokenize/mwe.py
+++ b/venv/lib/python3.7/site-packages/nltk/tokenize/mwe.py
@@ -0,0 +1,120 @@
+# Multi-Word Expression tokenizer
+#
+# Copyright (C) 2001-2019 NLTK Project
+# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Multi-Word Expression Tokenizer
+
+A ``MWETokenizer`` takes text which has already been divided into a list of tokens
+and retokenizes it, merging multi-word expressions into single tokens, using a
+lexicon of MWEs:
+
+
+    >>> from nltk.tokenize import MWETokenizer
+
+    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
+    >>> tokenizer.add_mwe(('in', 'spite', 'of'))
+
+    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
+    ['Testing', 'testing', 'testing', 'one', 'two', 'three']
+
+    >>> tokenizer.tokenize('This is a test in spite'.split())
+    ['This', 'is', 'a', 'test', 'in', 'spite']
+
+    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
+    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
+
+"""
+from nltk.util import Trie
+
+from nltk.tokenize.api import TokenizerI
+
+
+class MWETokenizer(TokenizerI):
+    """A tokenizer that processes tokenized text and merges multi-word expressions
+    into single tokens.
+    """
+
+    def __init__(self, mwes=None, separator='_'):
+        """Initialize the multi-word tokenizer with a list of expressions and a
+        separator
+
+        :type mwes: list(list(str))
+        :param mwes: A sequence of multi-word expressions to be merged, where
+            each MWE is a sequence of strings.
+        :type separator: str
+        :param separator: String that should be inserted between words in a multi-word
+            expression token. (Default is '_')
+
+        """
+        if not mwes:
+            mwes = []
+        self._mwes = Trie(mwes)
+        self._separator = separator
+
+    def add_mwe(self, mwe):
+        """Add a multi-word expression to the lexicon (stored as a word trie)
+
+        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. 
+        The key True marks the end of a valid MWE.
+
+        :param mwe: The multi-word expression we're adding into the word trie
+        :type mwe: tuple(str) or list(str)
+
+        :Example:
+
+        >>> tokenizer = MWETokenizer()
+        >>> tokenizer.add_mwe(('a', 'b'))
+        >>> tokenizer.add_mwe(('a', 'b', 'c'))
+        >>> tokenizer.add_mwe(('a', 'x'))
+        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
+        >>> tokenizer._mwes == expected
+        True
+
+        """
+        self._mwes.insert(mwe)
+
+    def tokenize(self, text):
+        """
+
+        :param text: A list containing tokenized text
+        :type text: list(str)
+        :return: A list of the tokenized text with multi-words merged together
+        :rtype: list(str)
+
+        :Example:
+
+        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
+        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
+        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
+        
+        """
+        i = 0
+        n = len(text)
+        result = []
+
+        while i < n:
+            if text[i] in self._mwes:
+                # possible MWE match
+                j = i
+                trie = self._mwes
+                while j < n and text[j] in trie:
+                    trie = trie[text[j]]
+                    j = j + 1
+                else:
+                    if Trie.LEAF in trie:
+                        # success!
+                        result.append(self._separator.join(text[i:j]))
+                        i = j
+                    else:
+                        # no match, so backtrack
+                        result.append(text[i])
+                        i += 1
+            else:
+                result.append(text[i])
+                i += 1
+
+        return result