Initial commit
This commit is contained in:
78
venv/lib/python3.7/site-packages/nltk/tokenize/api.py
Normal file
78
venv/lib/python3.7/site-packages/nltk/tokenize/api.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# Natural Language Toolkit: Tokenizer Interface
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Tokenizer Interface
|
||||
"""
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from six import add_metaclass
|
||||
|
||||
from nltk.internals import overridden
|
||||
from nltk.tokenize.util import string_span_tokenize
|
||||
|
||||
|
||||
@add_metaclass(ABCMeta)
class TokenizerI(object):
    """
    Abstract interface for objects that tokenize a string.

    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
    The batch and span helpers below have default implementations written
    in terms of ``tokenize()`` and ``span_tokenize()``.
    """

    @abstractmethod
    def tokenize(self, s):
        """
        Return a tokenized copy of *s*.

        The base implementation wraps *s* in a one-element list, hands it to
        ``tokenize_sents()`` and returns the first result, but only when
        ``overridden(self.tokenize_sents)`` is true.  Otherwise it returns
        ``None``.

        :param s: the string to tokenize
        :rtype: list of str
        """
        if not overridden(self.tokenize_sents):
            # Nothing to delegate to; same result as falling off the end.
            return None
        batch = self.tokenize_sents([s])
        return batch[0]

    def span_tokenize(self, s):
        """
        Identify the tokens of *s* by integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        This base implementation always raises; subclasses that support
        span tokenization need to provide their own version.

        :param s: the string to tokenize
        :rtype: iter(tuple(int, int))
        :raises NotImplementedError: always, in this base implementation
        """
        raise NotImplementedError()

    def tokenize_sents(self, strings):
        """
        Apply ``self.tokenize()`` to every element of ``strings`` and
        collect the results, in order, into a list.

        :param strings: an iterable of strings to tokenize
        :rtype: list(list(str))
        """
        tokenized = []
        for sentence in strings:
            tokenized.append(self.tokenize(sentence))
        return tokenized

    def span_tokenize_sents(self, strings):
        """
        Apply ``self.span_tokenize()`` to every element of ``strings``.

        This is a generator: for each input string it yields one list
        holding the spans produced by ``self.span_tokenize()`` for that
        string.

        :param strings: an iterable of strings to tokenize
        :rtype: iter(list(tuple(int, int)))
        """
        for sentence in strings:
            spans = list(self.span_tokenize(sentence))
            yield spans
|
||||
|
||||
|
||||
class StringTokenizer(TokenizerI):
    """A tokenizer that divides a string into substrings by splitting
    on a fixed separator string.

    The separator is read from ``self._string``, which is not set here;
    it is defined in subclasses.
    """

    def tokenize(self, s):
        """
        Split *s* on ``self._string`` using ``str.split``.

        :param s: the string to tokenize
        :return: the pieces of *s* between occurrences of the separator
        """
        separator = self._string
        return s.split(separator)

    def span_tokenize(self, s):
        """
        Yield, one at a time, the spans that ``string_span_tokenize``
        produces for *s* with ``self._string`` as the separator.

        :param s: the string to tokenize
        """
        separator = self._string
        for token_span in string_span_tokenize(s, separator):
            yield token_span
|
||||
Reference in New Issue
Block a user