Initial commit

2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions
--- a/venv/lib/python3.7/site-packages/nltk/tokenize/__init__.py
+++ b/venv/lib/python3.7/site-packages/nltk/tokenize/__init__.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Tokenizers
+#
+# Copyright (C) 2001-2019 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+#         Steven Bird <stevenbird1@gmail.com> (minor additions)
+# Contributors: matthewmc, clouds56
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+r"""
+NLTK Tokenizer Package
+
+Tokenizers divide strings into lists of substrings.  For example,
+tokenizers can be used to find the words and punctuation in a string:
+
+    >>> from nltk.tokenize import word_tokenize
+    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
+    ... two of them.\n\nThanks.'''
+    >>> word_tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+This particular tokenizer requires the Punkt sentence tokenization
+models to be installed. NLTK also provides a simpler,
+regular-expression based tokenizer, which splits text on whitespace
+and punctuation:
+
+    >>> from nltk.tokenize import wordpunct_tokenize
+    >>> wordpunct_tokenize(s)
+    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
+    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
+
+We can also operate at the level of sentences, using the sentence
+tokenizer directly as follows:
+
+    >>> from nltk.tokenize import sent_tokenize, word_tokenize
+    >>> sent_tokenize(s)
+    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
+    >>> [word_tokenize(t) for t in sent_tokenize(s)]
+    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
+    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
+
+Caution: when tokenizing a Unicode string, make sure you are not
+using an encoded version of the string (it may be necessary to
+decode it first, e.g. with ``s.decode("utf8")``).
+
+NLTK tokenizers can produce token-spans, represented as tuples of integers
+having the same semantics as string slices, to support efficient comparison
+of tokenizers.  (These methods are implemented as generators.)
+
+    >>> from nltk.tokenize import WhitespaceTokenizer
+    >>> list(WhitespaceTokenizer().span_tokenize(s))
+    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
+    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
+
+There are numerous ways to tokenize text.  If you need more control over
+tokenization, see the other methods provided in this package.
+
+For further information, please see Chapter 3 of the NLTK book.
+"""
+
+import re
+
+from nltk.data import load
+from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
+from nltk.tokenize.mwe import MWETokenizer
+from nltk.tokenize.punkt import PunktSentenceTokenizer
+from nltk.tokenize.regexp import (
+    RegexpTokenizer,
+    WhitespaceTokenizer,
+    BlanklineTokenizer,
+    WordPunctTokenizer,
+    wordpunct_tokenize,
+    regexp_tokenize,
+    blankline_tokenize,
+)
+from nltk.tokenize.repp import ReppTokenizer
+from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
+from nltk.tokenize.simple import (
+    SpaceTokenizer,
+    TabTokenizer,
+    LineTokenizer,
+    line_tokenize,
+)
+from nltk.tokenize.texttiling import TextTilingTokenizer
+from nltk.tokenize.toktok import ToktokTokenizer
+from nltk.tokenize.treebank import TreebankWordTokenizer
+from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
+from nltk.tokenize.stanford_segmenter import StanfordSegmenter
+from nltk.tokenize.sonority_sequencing import SyllableTokenizer
+
+
+# Standard sentence tokenizer.
+def sent_tokenize(text, language='english'):
+    """
+    Split *text* into sentences with NLTK's recommended sentence
+    tokenizer (currently :class:`.PunktSentenceTokenizer`).
+
+    The tokenizer is obtained by loading the resource
+    ``tokenizers/punkt/<language>.pickle`` through :func:`nltk.data.load`.
+
+    :param text: text to split into sentences
+    :param language: the model name in the Punkt corpus; it is
+        substituted into the resource path above
+    :return: the result of the loaded tokenizer's ``tokenize`` method
+        applied to *text*
+    """
+    model_path = 'tokenizers/punkt/{0}.pickle'.format(language)
+    return load(model_path).tokenize(text)
+
+
+# Standard word tokenizer.
+# Module-level tokenizer instance used by word_tokenize() below; the extra
+# rules defined in this section are registered on it once, at import time.
+_treebank_word_tokenizer = TreebankWordTokenizer()
+
+# See discussion on https://github.com/nltk/nltk/pull/1437
+# Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
+# - chevron quotes u'\xab' and u'\xbb' .
+# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
+# See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
+# Also, behavior of splitting on clitics now follows Stanford CoreNLP
+# - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
+#
+# Opening quotes: one of the listed opening quote characters, or a run of
+# one or more backticks.
+improved_open_quote_regex = re.compile(u'([«“‘„]|[`]+)', re.U)
+# Opening single quote: an apostrophe directly followed by exactly one word
+# character that ends at a word boundary, unless the text after the
+# apostrophe starts with one of the listed clitics (case-insensitive).
+improved_open_single_quote_regex = re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\b", re.U)
+# Closing quotes: one of the listed closing quote characters.
+improved_close_quote_regex = re.compile(u'([»”’])', re.U)
+# Final period: a '.' preceded by a non-period character and followed only
+# by optional closing brackets/quotes/spaces and whitespace up to the end of
+# the string.  The pattern is three adjacent literals (raw, unicode, raw)
+# that Python concatenates into a single character class.
+improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
+# Register the rules.  Each entry is a (pattern, replacement) pair; the
+# replacements pad the matched groups with spaces.  insert(0, ...) places a
+# rule at the front of its list, append() at the back.
+# NOTE(review): these calls mutate the lists in place.  If STARTING_QUOTES,
+# ENDING_QUOTES and PUNCTUATION are class-level attributes of
+# TreebankWordTokenizer, every instance of that class is affected, not only
+# this one -- confirm against nltk.tokenize.treebank.
+_treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
+_treebank_word_tokenizer.STARTING_QUOTES.append((improved_open_single_quote_regex, r'\1 \2'))
+_treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
+_treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))
+
+
+def word_tokenize(text, language='english', preserve_line=False):
+    """
+    Split *text* into word tokens with NLTK's recommended word tokenizer
+    (currently an improved :class:`.TreebankWordTokenizer`
+    along with :class:`.PunktSentenceTokenizer`
+    for the specified language).
+
+    Unless *preserve_line* is set, the text is first split into sentences
+    with :func:`sent_tokenize`; each sentence is then word-tokenized and
+    the tokens are returned as one flat list.
+
+    :param text: text to split into words
+    :type text: str
+    :param language: the model name in the Punkt corpus
+    :type language: str
+    :param preserve_line: if true, skip sentence tokenization and
+        word-tokenize *text* as a single unit.
+    :type preserve_line: bool
+    :return: a flat list of the tokens from every sentence, in order
+    """
+    if preserve_line:
+        # Treat the whole input as one "sentence".
+        sentences = [text]
+    else:
+        sentences = sent_tokenize(text, language)
+
+    tokens = []
+    for sentence in sentences:
+        tokens.extend(_treebank_word_tokenizer.tokenize(sentence))
+    return tokens