Initial commit

2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions
--- a/venv/lib/python3.7/site-packages/nltk/tokenize/api.py
+++ b/venv/lib/python3.7/site-packages/nltk/tokenize/api.py
@@ -0,0 +1,78 @@
+# Natural Language Toolkit: Tokenizer Interface
+#
+# Copyright (C) 2001-2019 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tokenizer Interface
+"""
+
+from abc import ABCMeta, abstractmethod
+from six import add_metaclass
+
+from nltk.internals import overridden
+from nltk.tokenize.util import string_span_tokenize
+
+
+@add_metaclass(ABCMeta)
+class TokenizerI(object):
+    """
+    Abstract interface for objects that split a string into tokens.
+    ``tokenize()`` is declared abstract, so concrete subclasses override it.
+    """
+
+    @abstractmethod
+    def tokenize(self, s):
+        """
+        Return the tokens of *s*, delegating to ``tokenize_sents()`` if overridden.
+
+        :rtype: list of str (``None`` when ``tokenize_sents()`` is not overridden)
+        """
+        if not overridden(self.tokenize_sents):
+            return None
+        return self.tokenize_sents([s])[0]
+
+    def span_tokenize(self, s):
+        """
+        Locate each token of *s* as a ``(start_i, end_i)`` offset pair such
+        that ``s[start_i:end_i]`` is the token.  Not implemented in this base.
+
+        :rtype: iter(tuple(int, int))
+        """
+        raise NotImplementedError()
+
+    def tokenize_sents(self, strings):
+        """
+        Tokenize every element of ``strings`` with ``self.tokenize()``.
+
+        :rtype: list(list(str))
+        """
+        tokenized = []
+        for text in strings:
+            tokenized.append(self.tokenize(text))
+        return tokenized
+
+    def span_tokenize_sents(self, strings):
+        """
+        Yield ``list(self.span_tokenize(s))`` for each ``s`` in ``strings``.
+
+        :rtype: iter(list(tuple(int, int)))
+        """
+        for text in strings:
+            yield list(self.span_tokenize(text))
+
+
+class StringTokenizer(TokenizerI):
+    """Tokenizer that splits its input on a fixed separator string, which
+    subclasses are expected to supply as ``self._string``."""
+
+    def tokenize(self, s):
+        separator = self._string
+        return s.split(separator)
+
+    def span_tokenize(self, s):
+        for offsets in string_span_tokenize(s, self._string):
+            yield offsets