Initial commit
This commit is contained in:
120
venv/lib/python3.7/site-packages/nltk/tokenize/mwe.py
Normal file
120
venv/lib/python3.7/site-packages/nltk/tokenize/mwe.py
Normal file
@@ -0,0 +1,120 @@
|
||||
# Multi-Word Expression tokenizer
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Multi-Word Expression Tokenizer
|
||||
|
||||
A ``MWETokenizer`` takes a string which has already been divided into tokens and
|
||||
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
|
||||
of MWEs:
|
||||
|
||||
|
||||
>>> from nltk.tokenize import MWETokenizer
|
||||
|
||||
>>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
|
||||
>>> tokenizer.add_mwe(('in', 'spite', 'of'))
|
||||
|
||||
>>> tokenizer.tokenize('Testing testing testing one two three'.split())
|
||||
['Testing', 'testing', 'testing', 'one', 'two', 'three']
|
||||
|
||||
>>> tokenizer.tokenize('This is a test in spite'.split())
|
||||
['This', 'is', 'a', 'test', 'in', 'spite']
|
||||
|
||||
>>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
|
||||
['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
|
||||
|
||||
"""
|
||||
from nltk.util import Trie
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
|
||||
class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.

    The lexicon of multi-word expressions (MWEs) is kept in a word trie, and
    matching is greedy: at each position the longest MWE that starts there is
    merged into one token.
    """

    def __init__(self, mwes=None, separator='_'):
        """Initialize the multi-word tokenizer with a list of expressions and a
        separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.  ``None`` or an empty sequence
            starts with an empty lexicon.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes == expected
        True

        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """Merge every multi-word expression found in ``text`` into one token.

        At each position the longest MWE from the lexicon that starts there is
        joined with the separator; tokens that do not start a complete MWE are
        copied through unchanged.

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        A shorter MWE is still merged when a longer candidate that shares its
        prefix fails to complete:

        >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit', 'more')])
        >>> tokenizer.tokenize('a little bit of'.split())
        ['a_little', 'bit', 'of']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            # Follow the trie for as long as the upcoming tokens allow,
            # remembering where the longest *complete* MWE ended.  Checking
            # only the node where the walk stops would lose a shorter MWE
            # whenever a longer candidate with the same prefix fails later.
            trie = self._mwes
            j = i
            last_match = -1
            while j < n and text[j] in trie:
                trie = trie[text[j]]
                j += 1
                if Trie.LEAF in trie:
                    last_match = j

            if last_match > i:
                # success! merge the longest complete expression
                result.append(self._separator.join(text[i:last_match]))
                i = last_match
            else:
                # no complete MWE starts here, so emit the token as-is
                result.append(text[i])
                i += 1

        return result
|
||||
Reference in New Issue
Block a user