Initial commit
This commit is contained in:
240
venv/lib/python3.7/site-packages/nltk/lm/__init__.py
Normal file
240
venv/lib/python3.7/site-packages/nltk/lm/__init__.py
Normal file
@@ -0,0 +1,240 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Models
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
NLTK Language Modeling Module.
|
||||
------------------------------
|
||||
|
||||
Currently this module covers only ngram language models, but it should be easy
|
||||
to extend to neural models.
|
||||
|
||||
|
||||
Preparing Data
|
||||
==============
|
||||
|
||||
Before we train our ngram models it is necessary to make sure the data we put in
|
||||
them is in the right format.
|
||||
Let's say we have a text that is a list of sentences, where each sentence is
|
||||
a list of strings. For simplicity we just consider a text consisting of
|
||||
characters instead of words.
|
||||
|
||||
>>> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
|
||||
|
||||
If we want to train a bigram model, we need to turn this text into bigrams.
|
||||
Here's what the first sentence of our text would look like if we use a function
|
||||
from NLTK for this.
|
||||
|
||||
>>> from nltk.util import bigrams
|
||||
>>> list(bigrams(text[0]))
|
||||
[('a', 'b'), ('b', 'c')]
|
||||
|
||||
Notice how "b" occurs both as the first and second member of different bigrams
|
||||
but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences
|
||||
start with "a" and end with "c"?
|
||||
A standard way to deal with this is to add special "padding" symbols to the
|
||||
sentence before splitting it into ngrams.
|
||||
Fortunately, NLTK also has a function for that, let's see what it does to the
|
||||
first sentence.
|
||||
|
||||
>>> from nltk.util import pad_sequence
|
||||
>>> list(pad_sequence(text[0],
|
||||
... pad_left=True,
|
||||
... left_pad_symbol="<s>",
|
||||
... pad_right=True,
|
||||
... right_pad_symbol="</s>",
|
||||
... n=2))
|
||||
['<s>', 'a', 'b', 'c', '</s>']
|
||||
|
||||
Note the `n` argument, that tells the function we need padding for bigrams.
|
||||
Now, passing all these parameters every time is tedious and in most cases they
|
||||
can be safely assumed as defaults anyway.
|
||||
Thus our module provides a convenience function that has all these arguments
|
||||
already set while the other arguments remain the same as for `pad_sequence`.
|
||||
|
||||
>>> from nltk.lm.preprocessing import pad_both_ends
|
||||
>>> list(pad_both_ends(text[0], n=2))
|
||||
['<s>', 'a', 'b', 'c', '</s>']
|
||||
|
||||
Combining the two parts discussed so far we get the following preparation steps
|
||||
for one sentence.
|
||||
|
||||
>>> list(bigrams(pad_both_ends(text[0], n=2)))
|
||||
[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]
|
||||
|
||||
To make our model more robust we could also train it on unigrams (single words)
|
||||
as well as bigrams, its main source of information.
|
||||
NLTK once again helpfully provides a function called `everygrams`.
|
||||
While not the most efficient, it is conceptually simple.
|
||||
|
||||
|
||||
>>> from nltk.util import everygrams
|
||||
>>> padded_bigrams = list(pad_both_ends(text[0], n=2))
|
||||
>>> list(everygrams(padded_bigrams, max_len=2))
|
||||
[('<s>',),
|
||||
('a',),
|
||||
('b',),
|
||||
('c',),
|
||||
('</s>',),
|
||||
('<s>', 'a'),
|
||||
('a', 'b'),
|
||||
('b', 'c'),
|
||||
('c', '</s>')]
|
||||
|
||||
We are almost ready to start counting ngrams, just one more step left.
|
||||
During training and evaluation our model will rely on a vocabulary that
|
||||
defines which words are "known" to the model.
|
||||
To create this vocabulary we need to pad our sentences (just like for counting
|
||||
ngrams) and then combine the sentences into one flat stream of words.
|
||||
|
||||
>>> from nltk.lm.preprocessing import flatten
|
||||
>>> list(flatten(pad_both_ends(sent, n=2) for sent in text))
|
||||
['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']
|
||||
|
||||
In most cases we want to use the same text as the source for both vocabulary
|
||||
and ngram counts.
|
||||
Now that we understand what this means for our preprocessing, we can simply import
|
||||
a function that does everything for us.
|
||||
|
||||
>>> from nltk.lm.preprocessing import padded_everygram_pipeline
|
||||
>>> train, vocab = padded_everygram_pipeline(2, text)
|
||||
|
||||
So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy
|
||||
iterators. They are evaluated on demand at training time.
|
||||
|
||||
|
||||
Training
|
||||
========
|
||||
Having prepared our data we are ready to start training a model.
|
||||
As a simple example, let us train a Maximum Likelihood Estimator (MLE).
|
||||
We only need to specify the highest ngram order to instantiate it.
|
||||
|
||||
>>> from nltk.lm import MLE
|
||||
>>> lm = MLE(2)
|
||||
|
||||
This automatically creates an empty vocabulary...
|
||||
|
||||
>>> len(lm.vocab)
|
||||
0
|
||||
|
||||
... which gets filled as we fit the model.
|
||||
|
||||
>>> lm.fit(train, vocab)
|
||||
>>> print(lm.vocab)
|
||||
<Vocabulary with cutoff=1 unk_label='<UNK>' and 9 items>
|
||||
>>> len(lm.vocab)
|
||||
9
|
||||
|
||||
The vocabulary helps us handle words that have not occurred during training.
|
||||
|
||||
>>> lm.vocab.lookup(text[0])
|
||||
('a', 'b', 'c')
|
||||
>>> lm.vocab.lookup(["aliens", "from", "Mars"])
|
||||
('<UNK>', '<UNK>', '<UNK>')
|
||||
|
||||
Moreover, in some cases we want to ignore words that we did see during training
|
||||
but that didn't occur frequently enough to provide us with useful information.
|
||||
You can tell the vocabulary to ignore such words.
|
||||
To find out how that works, check out the docs for the `Vocabulary` class.
|
||||
|
||||
|
||||
Using a Trained Model
|
||||
=====================
|
||||
When it comes to ngram models the training boils down to counting up the ngrams
|
||||
from the training corpus.
|
||||
|
||||
>>> print(lm.counts)
|
||||
<NgramCounter with 2 ngram orders and 24 ngrams>
|
||||
|
||||
This provides a convenient interface to access counts for unigrams...
|
||||
|
||||
>>> lm.counts['a']
|
||||
2
|
||||
|
||||
...and bigrams (in this case "a b")
|
||||
|
||||
>>> lm.counts[['a']]['b']
|
||||
1
|
||||
|
||||
And so on. However, the real purpose of training a language model is to have it
|
||||
score how probable words are in certain contexts.
|
||||
This being MLE, the model returns the item's relative frequency as its score.
|
||||
|
||||
>>> lm.score("a")
|
||||
0.15384615384615385
|
||||
|
||||
Items that are not seen during training are mapped to the vocabulary's
|
||||
"unknown label" token. This is "<UNK>" by default.
|
||||
|
||||
>>> lm.score("<UNK>") == lm.score("aliens")
|
||||
True
|
||||
|
||||
Here's how you get the score for a word given some preceding context.
|
||||
For example, we want to know the chance that "b" follows "a".
|
||||
|
||||
>>> lm.score("b", ["a"])
|
||||
0.5
|
||||
|
||||
To avoid underflow when working with many small score values it makes sense to
|
||||
take their logarithm.
|
||||
For convenience this can be done with the `logscore` method.
|
||||
|
||||
>>> lm.logscore("a")
|
||||
-2.700439718141092
|
||||
|
||||
Building on this method, we can also evaluate our model's cross-entropy and
|
||||
perplexity with respect to sequences of ngrams.
|
||||
|
||||
>>> test = [('a', 'b'), ('c', 'd')]
|
||||
>>> lm.entropy(test)
|
||||
1.292481250360578
|
||||
>>> lm.perplexity(test)
|
||||
2.449489742783178
|
||||
|
||||
It is advisable to preprocess your test text exactly the same way as you did
|
||||
the training text.
|
||||
|
||||
One cool feature of ngram models is that they can be used to generate text.
|
||||
|
||||
>>> lm.generate(1, random_seed=3)
|
||||
'<s>'
|
||||
>>> lm.generate(5, random_seed=3)
|
||||
['<s>', 'a', 'b', 'c', 'd']
|
||||
|
||||
Provide `random_seed` if you want to consistently reproduce the same text all
|
||||
other things being equal. Here we are using it to test the examples.
|
||||
|
||||
You can also condition your generation on some preceding text with the `text_seed`
|
||||
argument.
|
||||
|
||||
>>> lm.generate(5, text_seed=['c'], random_seed=3)
|
||||
['</s>', 'c', 'd', 'c', 'd']
|
||||
|
||||
Note that an ngram model is restricted in how much preceding context it can
|
||||
take into account. For example, a trigram model can only condition its output
|
||||
on 2 preceding words. If you pass in a 4-word context, the first two words
|
||||
will be ignored.
|
||||
"""
|
||||
|
||||
from nltk.lm.models import (
|
||||
MLE,
|
||||
Lidstone,
|
||||
Laplace,
|
||||
WittenBellInterpolated,
|
||||
KneserNeyInterpolated,
|
||||
)
|
||||
from nltk.lm.counter import NgramCounter
|
||||
from nltk.lm.vocabulary import Vocabulary
|
||||
|
||||
__all__ = [
|
||||
"Vocabulary",
|
||||
"NgramCounter",
|
||||
"MLE",
|
||||
"Lidstone",
|
||||
"Laplace",
|
||||
"WittenBellInterpolated",
|
||||
"KneserNeyInterpolated",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
252
venv/lib/python3.7/site-packages/nltk/lm/api.py
Normal file
252
venv/lib/python3.7/site-packages/nltk/lm/api.py
Normal file
@@ -0,0 +1,252 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Models
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Authors: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""Language Model Interface."""
|
||||
from __future__ import division, unicode_literals
|
||||
|
||||
import random
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from bisect import bisect
|
||||
|
||||
from six import add_metaclass
|
||||
|
||||
from nltk.lm.counter import NgramCounter
|
||||
from nltk.lm.util import log_base2
|
||||
from nltk.lm.vocabulary import Vocabulary
|
||||
|
||||
try:
    from itertools import accumulate
except ImportError:
    # Older interpreters lack itertools.accumulate, so supply an equivalent
    # generator under the same name.
    import operator

    def accumulate(iterable, func=operator.add):
        """Yield the running reduction of ``iterable`` under ``func``.

        With the default ``func`` these are running totals::

            accumulate([1, 2, 3, 4, 5]) --> 1 3 6 10 15
            accumulate([1, 2, 3, 4, 5], operator.mul) --> 1 2 6 24 120

        An empty ``iterable`` yields nothing.
        """
        remaining = iter(iterable)
        nothing = object()
        running = next(remaining, nothing)
        if running is nothing:
            return
        yield running
        for item in remaining:
            running = func(running, item)
            yield running
|
||||
|
||||
|
||||
@add_metaclass(ABCMeta)
class Smoothing(object):
    """Ngram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algorithms to
    work both with Backoff and Interpolation.
    """

    def __init__(self, vocabulary, counter):
        """
        :param vocabulary: The Ngram vocabulary object.
        :type vocabulary: nltk.lm.vocabulary.Vocabulary
        :param counter: The counts of the vocabulary items.
        :type counter: nltk.lm.counter.NgramCounter
        """
        self.counts = counter
        self.vocab = vocabulary

    @abstractmethod
    def unigram_score(self, word):
        """Return the score of ``word`` when no context is available.

        Concrete smoothing classes must override this.
        """
        raise NotImplementedError()

    @abstractmethod
    def alpha_gamma(self, word, context):
        """Return the ``(alpha, gamma)`` pair for ``word`` in ``context``.

        Concrete smoothing classes must override this.
        """
        raise NotImplementedError()
|
||||
|
||||
|
||||
def _mean(items):
|
||||
"""Return average (aka mean) for sequence of items."""
|
||||
return sum(items) / len(items)
|
||||
|
||||
|
||||
def _random_generator(seed_or_generator):
|
||||
if isinstance(seed_or_generator, random.Random):
|
||||
return seed_or_generator
|
||||
return random.Random(seed_or_generator)
|
||||
|
||||
|
||||
def _weighted_choice(population, weights, random_generator=None):
|
||||
"""Like random.choice, but with weights.
|
||||
|
||||
Heavily inspired by python 3.6 `random.choices`.
|
||||
"""
|
||||
if not population:
|
||||
raise ValueError("Can't choose from empty population")
|
||||
if len(population) != len(weights):
|
||||
raise ValueError("The number of weights does not match the population")
|
||||
cum_weights = list(accumulate(weights))
|
||||
total = cum_weights[-1]
|
||||
threshold = random_generator.random()
|
||||
return population[bisect(cum_weights, total * threshold)]
|
||||
|
||||
|
||||
@add_metaclass(ABCMeta)
class LanguageModel(object):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param int order: Highest ngram order the model works with.
        :param vocabulary: If provided, this vocabulary will be used instead
        of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None

        """
        self.order = order
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.
        :param vocabulary_text: Words used to fill the vocabulary. Only
        consulted when the model's vocabulary is still empty.
        :raises ValueError: if the vocabulary is empty and no
        `vocabulary_text` is given.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to " "create it from."
                )
            self.vocab.update(vocabulary_text)
        # Every sentence is passed through the vocabulary lookup before counting.
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param context: Context the word is in.
        If `None`, compute unigram score.
        :type context: tuple(str) or None
        :rtype: float

        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        An empty or missing context selects the unigram counts.

        :type context: tuple(str) or None

        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        # The last item of each ngram is the word, everything before it the context.
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
        makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # base recursion case
        if num_words == 1:
            # A model of order n conditions on at most n - 1 preceding words.
            # The unigram case needs an explicit empty context because
            # `text_seed[-0:]` is the whole seed rather than an empty list.
            max_context = self.order - 1
            context = text_seed[-max_context:] if max_context > 0 else []
            samples = self.context_counts(self.vocab.lookup(context))
            # Shorten the context from the left until some continuation is known.
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # sorting achieves two things:
            # - reproducible randomness when sampling
            # - turning Mapping into Sequence which _weighted_choice expects
            samples = sorted(samples)
            return _weighted_choice(
                samples, tuple(self.score(w, context) for w in samples), random_generator
            )
        # build up text one word at a time
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
|
||||
168
venv/lib/python3.7/site-packages/nltk/lm/counter.py
Normal file
168
venv/lib/python3.7/site-packages/nltk/lm/counter.py
Normal file
@@ -0,0 +1,168 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
Language Model Counter
|
||||
----------------------
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import Sequence, defaultdict
|
||||
|
||||
from six import string_types
|
||||
from nltk import compat
|
||||
from nltk.probability import ConditionalFreqDist, FreqDist
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class NgramCounter(object):
    """Class for counting ngrams.

    Will count any ngram sequence you give it ;)

    First we need to make sure we are feeding the counter sentences of ngrams.

    >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
    >>> from nltk.util import ngrams
    >>> text_bigrams = [ngrams(sent, 2) for sent in text]
    >>> text_unigrams = [ngrams(sent, 1) for sent in text]

    The counting itself is very simple.

    >>> from nltk.lm import NgramCounter
    >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)

    You can conveniently access ngram counts using standard python dictionary notation.
    String keys will give you unigram counts.

    >>> ngram_counts['a']
    2
    >>> ngram_counts['aliens']
    0

    If you want to access counts for higher order ngrams, use a list or a tuple.
    These are treated as "context" keys, so what you get is a frequency distribution
    over all continuations after the given context.

    >>> sorted(ngram_counts[['a']].items())
    [('b', 1), ('c', 1)]
    >>> sorted(ngram_counts[('a',)].items())
    [('b', 1), ('c', 1)]

    This is equivalent to specifying explicitly the order of the ngram (in this case
    2 for bigram) and indexing on the context.
    >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
    True

    Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
    It is generally advisable to use the less verbose and more flexible square
    bracket notation.

    To get the count of the full ngram "a b", do this:

    >>> ngram_counts[['a']]['b']
    1

    Specifying the ngram order as a number can be useful for accessing all ngrams
    in that order.

    >>> ngram_counts[2]
    <ConditionalFreqDist with 4 conditions>

    The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
    Unigrams can also be accessed with a human-friendly alias.

    >>> ngram_counts.unigrams is ngram_counts[1]
    True

    Similarly to `collections.Counter`, you can update counts after initialization.

    >>> ngram_counts['e']
    0
    >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
    >>> ngram_counts['e']
    1

    """

    def __init__(self, ngram_text=None):
        """Creates a new NgramCounter.

        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
        `update` method to be called explicitly.

        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
        :type ngram_text: Iterable(Iterable(tuple(str))) or None

        """
        # Orders above 1 are created on demand as conditional distributions;
        # order 1 is a plain frequency distribution shared with `unigrams`.
        self._counts = defaultdict(ConditionalFreqDist)
        self._counts[1] = self.unigrams = FreqDist()

        if ngram_text:
            self.update(ngram_text)

    def update(self, ngram_text):
        """Updates ngram counts from `ngram_text`.

        Expects `ngram_text` to be a sequence of sentences (sequences).
        Each sentence consists of ngrams as tuples of strings.

        :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams.
        :raises TypeError: if the ngrams are not tuples.

        """
        for sentence in ngram_text:
            for ngram in sentence:
                if not isinstance(ngram, tuple):
                    raise TypeError(
                        "Ngram <{0}> isn't a tuple, but {1}".format(ngram, type(ngram))
                    )

                size = len(ngram)
                if size == 1:
                    self.unigrams[ngram[0]] += 1
                else:
                    # Everything but the last item is the context of that last item.
                    prefix, last = ngram[:-1], ngram[-1]
                    self[size][prefix][last] += 1

    def N(self):
        """Returns grand total number of ngrams stored.

        This includes ngrams from all orders, so some duplication is expected.
        :rtype: int

        >>> from nltk.lm import NgramCounter
        >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
        >>> counts.N()
        3

        """
        grand_total = 0
        for per_order in self._counts.values():
            grand_total = grand_total + per_order.N()
        return grand_total

    def __getitem__(self, item):
        """User-friendly access to ngram counts.

        An int selects all counts of that ngram order, a string selects a
        unigram count, and any other sequence is treated as a context.
        """
        if isinstance(item, int):
            return self._counts[item]
        if isinstance(item, string_types):
            return self._counts[1][item]
        if isinstance(item, Sequence):
            return self._counts[len(item) + 1][tuple(item)]
        # Any other key type yields None.
        return None

    def __str__(self):
        template = "<{0} with {1} ngram orders and {2} ngrams>"
        return template.format(self.__class__.__name__, len(self._counts), self.N())

    def __len__(self):
        return len(self._counts)

    def __contains__(self, item):
        return item in self._counts
|
||||
100
venv/lib/python3.7/site-packages/nltk/lm/models.py
Normal file
100
venv/lib/python3.7/site-packages/nltk/lm/models.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Models
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""Language Models"""
|
||||
from __future__ import division, unicode_literals
|
||||
|
||||
from nltk import compat
|
||||
from nltk.lm.api import LanguageModel, Smoothing
|
||||
from nltk.lm.smoothing import KneserNey, WittenBell
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.

    Inherits initialization from LanguageModel.
    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.

        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        # The score is the relative frequency of `word` among the continuations
        # counted for `context`.
        continuations = self.context_counts(context)
        return continuations.freq(word)
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.

    In addition to initialization arguments from LanguageModel also requires
    a number by which to increase the counts, gamma.
    """

    def __init__(self, gamma, *args, **kwargs):
        """Store `gamma` and forward the remaining arguments to LanguageModel."""
        super(Lidstone, self).__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Additive smoothing: Lidstone or Laplace.

        To see what kind, look at `gamma` attribute on the class.

        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        # `gamma` is added once to the word's count and once per vocabulary
        # item to the normaliser.
        smoothed_count = word_count + self.gamma
        smoothed_total = norm_count + len(self.vocab) * self.gamma
        return smoothed_count / smoothed_total
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.

    Initialization identical to LanguageModel because gamma is always 1.
    """

    def __init__(self, *args, **kwargs):
        """Initialise a Lidstone model with gamma fixed to 1."""
        add_one = 1
        super(Laplace, self).__init__(add_one, *args, **kwargs)
|
||||
|
||||
|
||||
class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.

    The idea to abstract this comes from Chen & Goodman 1995.
    """

    def __init__(self, smoothing_cls, order, **kwargs):
        """Create the model together with its smoothing estimator.

        :param smoothing_cls: Subclass of `Smoothing` used to compute the
        interpolation terms.
        :param order: Passed on to `LanguageModel`.
        :param kwargs: An optional `params` dict is passed as keyword arguments
        to `smoothing_cls`; everything else goes to `LanguageModel`.
        """
        assert issubclass(smoothing_cls, Smoothing)
        params = kwargs.pop("params", {})
        super(InterpolatedLanguageModel, self).__init__(order, **kwargs)
        # The estimator shares this model's vocabulary and counter objects.
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        """Interpolate the score for `context` with those of its shorter suffixes.

        :param str word: Word to score.
        :param context: Preceding words; if empty or None the estimator's
        unigram score is returned.
        :type context: tuple(str) or None
        :rtype: float
        """
        if not context:
            return self.estimator.unigram_score(word)
        if not self.counts[context]:
            # Nothing was ever counted after this context, so there is no
            # evidence at this order: defer entirely to the lower-order score.
            # Without this branch the estimator would divide by zero or assign
            # the word zero probability.
            alpha, gamma = 0.0, 1.0
        else:
            alpha, gamma = self.estimator.alpha_gamma(word, context)
        # Drop the oldest context word for the lower-order term.
        return alpha + gamma * self.unmasked_score(word, context[1:])
|
||||
|
||||
|
||||
class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        """Build an interpolated model that uses `WittenBell` as its estimator."""
        smoothing_cls = WittenBell
        super(WittenBellInterpolated, self).__init__(smoothing_cls, order, **kwargs)
|
||||
|
||||
|
||||
class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        """Build an interpolated model that uses `KneserNey` as its estimator.

        `discount` is handed to the estimator through the `params` argument.
        """
        estimator_params = {"discount": discount}
        super(KneserNeyInterpolated, self).__init__(
            KneserNey, order, params=estimator_params, **kwargs
        )
|
||||
51
venv/lib/python3.7/site-packages/nltk/lm/preprocessing.py
Normal file
51
venv/lib/python3.7/site-packages/nltk/lm/preprocessing.py
Normal file
@@ -0,0 +1,51 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Model Preprocessing
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
from functools import partial
|
||||
from itertools import chain
|
||||
|
||||
from nltk.util import everygrams, pad_sequence
|
||||
|
||||
# Turns an iterable of iterables into one flat iterator over their items.
flatten = chain.from_iterable

# `pad_sequence` with padding enabled on both sides and the conventional
# sentence boundary symbols filled in; callers still supply `n`.
pad_both_ends = partial(
    pad_sequence,
    pad_right=True,
    right_pad_symbol="</s>",
    pad_left=True,
    left_pad_symbol="<s>",
)
pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.

    Following convention <s> pads the start of sentence </s> pads its end.
    """
|
||||
|
||||
|
||||
def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Pads `sentence` with pad_both_ends for ngrams of size `order`, then
    returns the everygrams of the padded sentence up to length `order`.
    """
    padded_sentence = list(pad_both_ends(sentence, n=order))
    return everygrams(padded_sentence, max_len=order)
|
||||
|
||||
|
||||
def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:
    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    Both results are built from the same `text` object.

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences:
    Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    ngram_sentences = (
        everygrams(list(padding_fn(sent)), max_len=order) for sent in text
    )
    vocabulary_words = flatten(map(padding_fn, text))
    return ngram_sentences, vocabulary_words
|
||||
61
venv/lib/python3.7/site-packages/nltk/lm/smoothing.py
Normal file
61
venv/lib/python3.7/site-packages/nltk/lm/smoothing.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Model Smoothing
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""Smoothing algorithms for language modeling.
|
||||
|
||||
According to Chen & Goodman 1995 these should work with both Backoff and
|
||||
Interpolation.
|
||||
"""
|
||||
|
||||
from nltk.lm.api import Smoothing
|
||||
|
||||
|
||||
def _count_non_zero_vals(dictionary):
|
||||
return sum(1.0 for c in dictionary.values() if c > 0)
|
||||
|
||||
|
||||
class WittenBell(Smoothing):
    """Witten-Bell smoothing."""

    def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
        """
        :param vocabulary: The Ngram vocabulary object.
        :param counter: The counts of the vocabulary items.
        :param discount: Not used by this class; kept so existing callers
        that pass it keep working.
        """
        # Forward keyword arguments as keywords; unpacking them with a single
        # star would pass the dict's keys as positional arguments.
        super(WittenBell, self).__init__(vocabulary, counter, **kwargs)

    def alpha_gamma(self, word, context):
        """Return the higher-order term, weighted by (1 - gamma), and gamma."""
        gamma = self.gamma(context)
        return (1.0 - gamma) * self.alpha(word, context), gamma

    def unigram_score(self, word):
        """Return the relative frequency of `word` among all unigrams."""
        return self.counts.unigrams.freq(word)

    def alpha(self, word, context):
        """Return the relative frequency of `word` after `context`."""
        return self.counts[context].freq(word)

    def gamma(self, context):
        """Return the weight given to the lower-order distribution.

        This is the number of distinct words seen after `context` divided by
        that number plus the total count of words seen after `context`.
        """
        continuations = self.counts[context]
        total = continuations.N()
        if not total:
            # Nothing was observed after this context: rely fully on the
            # lower-order distribution instead of dividing by zero.
            return 1.0
        n_plus = _count_non_zero_vals(continuations)
        # Normalise by the count of this context, not by the count of every
        # ngram of the same order.
        return n_plus / (n_plus + total)
|
||||
|
||||
|
||||
class KneserNey(Smoothing):
    """Kneser-Ney Smoothing."""

    def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
        """
        :param vocabulary: The Ngram vocabulary object.
        :param counter: The counts of the vocabulary items.
        :param discount: Amount subtracted from every observed count.
        """
        # Forward keyword arguments as keywords; unpacking them with a single
        # star would pass the dict's keys as positional arguments.
        super(KneserNey, self).__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def unigram_score(self, word):
        """Return a uniform score over the vocabulary."""
        return 1.0 / len(self.vocab)

    def alpha_gamma(self, word, context):
        """Return the discounted score of `word` after `context` and the
        weight of the lower-order distribution."""
        prefix_counts = self.counts[context]
        return self.alpha(word, prefix_counts), self.gamma(prefix_counts)

    def alpha(self, word, prefix_counts):
        """Return the discounted relative frequency of `word` in `prefix_counts`."""
        total = prefix_counts.N()
        if not total:
            # No observations for this context: contribute nothing at this order.
            return 0.0
        return max(prefix_counts[word] - self.discount, 0.0) / total

    def gamma(self, prefix_counts):
        """Return the weight given to the lower-order distribution."""
        total = prefix_counts.N()
        if not total:
            # No observations for this context: rely fully on the lower order.
            return 1.0
        return self.discount * _count_non_zero_vals(prefix_counts) / total
|
||||
|
||||
20
venv/lib/python3.7/site-packages/nltk/lm/util.py
Normal file
20
venv/lib/python3.7/site-packages/nltk/lm/util.py
Normal file
@@ -0,0 +1,20 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""Language Model Utilities"""
|
||||
|
||||
from math import log
|
||||
|
||||
# Floating-point infinities shared by the language-model code.
NEG_INF = float("-inf")
POS_INF = float("inf")


def log_base2(score):
    """Return the base-2 logarithm of ``score``.

    A score equal to zero yields ``NEG_INF`` instead of raising.
    """
    return NEG_INF if score == 0.0 else log(score, 2)
|
||||
248
venv/lib/python3.7/site-packages/nltk/lm/vocabulary.py
Normal file
248
venv/lib/python3.7/site-packages/nltk/lm/vocabulary.py
Normal file
@@ -0,0 +1,248 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""Language Model Vocabulary"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import sys
|
||||
from collections import Counter, Iterable
|
||||
from itertools import chain
|
||||
|
||||
from nltk import compat
|
||||
|
||||
try:
|
||||
# Python >= 3.4
|
||||
from functools import singledispatch
|
||||
except ImportError:
|
||||
# Python < 3.4
|
||||
from singledispatch import singledispatch
|
||||
|
||||
|
||||
@singledispatch
|
||||
def _dispatched_lookup(words, vocab):
|
||||
raise TypeError(
|
||||
"Unsupported type for looking up in vocabulary: {0}".format(type(words))
|
||||
)
|
||||
|
||||
|
||||
@_dispatched_lookup.register(Iterable)
def _(words, vocab):
    """Look up every element of an iterable in the vocabulary.

    Each element is dispatched again on its own type, so nested iterables
    are handled recursively.

    :return: Tuple with the looked-up elements, in input order.
    """
    looked_up = []
    for word in words:
        looked_up.append(_dispatched_lookup(word, vocab))
    return tuple(looked_up)
|
||||
|
||||
|
||||
# On Python 2 the builtin ``basestring`` covers both ``unicode`` and ``str``.
# Python 3 has no such name, so ``str`` is bound to it instead.
try:
    basestring
except NameError:
    basestring = str


@_dispatched_lookup.register(basestring)
def _string_lookup(word, vocab):
    """Return ``word`` when it is in ``vocab``, else the unknown label."""
    if word in vocab:
        return word
    return vocab.unk_label
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class Vocabulary(object):
    """Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:
    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return a tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    The interface follows that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    """

    def __init__(self, counts=None, unk_cutoff=1, unk_label="<UNK>"):
        """Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
                       pre-seed the Vocabulary. In case it is iterable, counts
                       are calculated. A `Counter` instance is stored as-is
                       (not copied), so it stays shared with the caller.
        :param int unk_cutoff: Words that occur less frequently than this value
                               are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.
        :raises ValueError: If `unk_cutoff` is less than 1.

        """
        if isinstance(counts, Counter):
            self.counts = counts
        else:
            self.counts = Counter()
            if isinstance(counts, Iterable):
                self.counts.update(counts)
        self.unk_label = unk_label
        if unk_cutoff < 1:
            raise ValueError(
                "Cutoff value cannot be less than 1. Got: {0}".format(unk_cutoff)
            )
        self._cutoff = unk_cutoff

    @property
    def cutoff(self):
        """Cutoff value.

        Items with count below this value are not considered part of vocabulary.

        """
        return self._cutoff

    def update(self, *counter_args, **counter_kwargs):
        """Update vocabulary counts.

        Wraps `collections.Counter.update` method.

        """
        self.counts.update(*counter_args, **counter_kwargs)

    def lookup(self, words):
        """Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed a sequence of words, will try to look
        each of them up and return a tuple of the looked up words.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: tuple(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        """
        return _dispatched_lookup(words, self)

    def __getitem__(self, item):
        """Return the count of `item`.

        The unknown label reports the cutoff value as its count, which makes
        it pass the membership check.
        """
        return self._cutoff if item == self.unk_label else self.counts[item]

    def __contains__(self, item):
        """Only consider items with counts GE to cutoff as being in the
        vocabulary."""
        return self[item] >= self.cutoff

    def __iter__(self):
        """Building on membership check define how to iterate over
        vocabulary."""
        return chain(
            (item for item in self.counts if item in self),
            [self.unk_label] if self.counts else [],
        )

    def __len__(self):
        """Computing size of vocabulary reflects the cutoff."""
        return sum(1 for _ in self)

    def __eq__(self, other):
        """Compare unknown label, cutoff and counts with those of `other`.

        Returns `NotImplemented` when `other` lacks any of these attributes,
        so comparing with an unrelated object yields `False` instead of
        raising `AttributeError`.
        """
        try:
            other_label = other.unk_label
            other_cutoff = other.cutoff
            other_counts = other.counts
        except AttributeError:
            return NotImplemented
        return (
            self.unk_label == other_label
            and self.cutoff == other_cutoff
            and self.counts == other_counts
        )

    if sys.version_info[0] == 2:
        # see https://stackoverflow.com/a/35781654/4501212
        def __ne__(self, other):
            """Negate `__eq__`, passing `NotImplemented` through unchanged."""
            equal = self.__eq__(other)
            return equal if equal is NotImplemented else not equal

    def __str__(self):
        """Return a summary with class name, cutoff, unknown label and size."""
        return "<{0} with cutoff={1} unk_label='{2}' and {3} items>".format(
            self.__class__.__name__, self.cutoff, self.unk_label, len(self)
        )
|
||||
Reference in New Issue
Block a user