Initial commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,135 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Model Unit Tests
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import unittest
|
||||
|
||||
import six
|
||||
|
||||
from nltk import FreqDist
|
||||
from nltk.lm import NgramCounter
|
||||
from nltk.util import everygrams
|
||||
|
||||
|
||||
class NgramCounterTests(unittest.TestCase):
    """Tests for NgramCounter that only involve lookup, no modification.

    The bigram and trigram counters are built once in ``setUpClass`` and are
    shared by every test, so tests must not alter them.
    """

    @classmethod
    def setUpClass(cls):
        """Build the shared counters from two short character sentences."""
        # Kept on the class so a test can build its own throwaway counter.
        cls.sentences = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            everygrams(sent, max_len=3) for sent in cls.sentences
        )
        cls.bigram_counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in cls.sentences
        )

    def test_N(self):
        # 9 unigrams + 7 bigrams = 16; the trigram counter adds 5 trigrams.
        self.assertEqual(self.bigram_counter.N(), 16)
        self.assertEqual(self.trigram_counter.N(), 21)

    def test_counter_len_changes_with_lookup(self):
        # Looking up an unseen ngram order adds an entry to the counter.
        # Use a private counter so the shared fixture stays untouched and
        # the test gives the same result when it is run more than once.
        counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in self.sentences
        )
        self.assertEqual(len(counter), 2)
        _ = counter[50]
        self.assertEqual(len(counter), 3)

    def test_ngram_order_access_unigrams(self):
        self.assertEqual(self.bigram_counter[1], self.bigram_counter.unigrams)

    def test_ngram_conditional_freqdist(self):
        expected_trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)]

        bigrams = self.trigram_counter[2]
        trigrams = self.trigram_counter[3]

        six.assertCountEqual(self, expected_bigram_contexts, bigrams.conditions())
        six.assertCountEqual(self, expected_trigram_contexts, trigrams.conditions())

    def test_bigram_counts_seen_ngrams(self):
        b_given_a_count = 1
        c_given_b_count = 1

        self.assertEqual(b_given_a_count, self.bigram_counter[["a"]]["b"])
        self.assertEqual(c_given_b_count, self.bigram_counter[["b"]]["c"])

    def test_bigram_counts_unseen_ngrams(self):
        z_given_b_count = 0

        self.assertEqual(z_given_b_count, self.bigram_counter[["b"]]["z"])

    def test_unigram_counts_seen_words(self):
        expected_count_b = 2

        self.assertEqual(expected_count_b, self.bigram_counter["b"])

    def test_unigram_counts_completely_unseen_words(self):
        unseen_count = 0

        self.assertEqual(unseen_count, self.bigram_counter["z"])
|
||||
|
||||
|
||||
class NgramCounterTrainingTests(unittest.TestCase):
    """Tests for how NgramCounter is populated from different inputs."""

    def setUp(self):
        self.counter = NgramCounter()

    def _check_untrained(self, counter):
        """Assert ``counter`` has no bigram entry and an empty unigram FreqDist."""
        self.assertNotIn(2, counter)
        self.assertEqual(counter[1], FreqDist())

    def test_empty_string(self):
        self._check_untrained(NgramCounter(""))

    def test_empty_list(self):
        self._check_untrained(NgramCounter([]))

    def test_None(self):
        self._check_untrained(NgramCounter(None))

    def test_train_on_unigrams(self):
        tokens = list("abcd")
        unigram_sent = [(tok,) for tok in tokens]
        trained = NgramCounter([unigram_sent])

        # Only the unigram order should have received any counts.
        self.assertFalse(bool(trained[3]))
        self.assertFalse(bool(trained[2]))
        six.assertCountEqual(self, tokens, trained[1].keys())

    def test_train_on_illegal_sentences(self):
        # Sentences made of bare strings or of lists (instead of tuples)
        # are expected to be rejected with a TypeError.
        sent_of_strings = ["Check", "this", "out", "!"]
        sent_of_lists = [["Check", "this"], ["this", "out"], ["out", "!"]]

        with self.assertRaises(TypeError):
            NgramCounter([sent_of_strings])

        with self.assertRaises(TypeError):
            NgramCounter([sent_of_lists])

    def test_train_on_bigrams(self):
        trained = NgramCounter([[("a", "b"), ("c", "d")]])

        self.assertFalse(bool(trained[3]))

    def test_train_on_mix(self):
        trained = NgramCounter([[("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)]])
        expected_unigrams = ["h"]
        expected_bigram_contexts = [("a",), ("c",)]
        expected_trigram_contexts = [("e", "f")]

        six.assertCountEqual(self, expected_unigrams, trained[1].keys())
        six.assertCountEqual(self, expected_bigram_contexts, trained[2].keys())
        six.assertCountEqual(self, expected_trigram_contexts, trained[3].keys())
|
||||
@@ -0,0 +1,446 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Model Unit Tests
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from __future__ import division
|
||||
|
||||
import math
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from six import add_metaclass
|
||||
|
||||
from nltk.lm import (
|
||||
Vocabulary,
|
||||
MLE,
|
||||
Lidstone,
|
||||
Laplace,
|
||||
WittenBellInterpolated,
|
||||
KneserNeyInterpolated,
|
||||
)
|
||||
from nltk.lm.preprocessing import padded_everygrams
|
||||
|
||||
|
||||
def _prepare_test_data(ngram_order):
    """Return the ``(vocabulary, training_text)`` pair used by the model tests.

    :param ngram_order: order handed to ``padded_everygrams`` for each sentence.
    :return: a ``Vocabulary`` with ``unk_cutoff=1`` and a list holding, for
        each of the two training sentences, the list of its padded everygrams.
    """
    vocab = Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
    training_text = []
    for sent in (list("abcd"), list("egadbe")):
        training_text.append(list(padded_everygrams(ngram_order, sent)))
    return vocab, training_text
|
||||
|
||||
|
||||
class ParametrizeTestsMeta(type):
    """Metaclass for generating parametrized tests.

    For every class created with this metaclass it adds:

    * one ``test_sumto1_<i>`` method per built-in context, checking that the
      scores of all vocabulary words in that context sum to 1;
    * one ``test_score_<i>`` method per ``(word, context, expected_score)``
      entry of the class attribute ``score_tests`` (if the class defines it).

    The generated methods read the model under test from ``self.model``.
    """

    def __new__(cls, name, bases, dct):
        contexts = (
            ("a",),
            ("c",),
            (u"<s>",),
            ("b",),
            (u"<UNK>",),
            ("d",),
            ("e",),
            ("r",),
            ("w",),
        )
        for i, c in enumerate(contexts):
            dct["test_sumto1_{0}".format(i)] = cls.add_sum_to_1_test(c)
        scores = dct.get("score_tests", [])
        for i, (word, context, expected_score) in enumerate(scores):
            dct["test_score_{0}".format(i)] = cls.add_score_test(
                word, context, expected_score
            )
        return super(ParametrizeTestsMeta, cls).__new__(cls, name, bases, dct)

    @classmethod
    def add_score_test(cls, word, context, expected_score):
        """Return a test method checking ``model.score(word, context)``.

        The score must equal ``expected_score`` to 4 decimal places.
        """
        if sys.version_info > (3, 5):
            message = "word='{word}', context={context}"
        else:
            # Python 2 doesn't report the mismatched values if we pass a custom
            # message, so we have to report them manually.
            message = (
                "{score} != {expected_score} within 4 places, "
                "word='{word}', context={context}"
            )

        def test_method(self):
            score = self.model.score(word, context)
            # Name the format fields explicitly instead of passing
            # ``**locals()``, which only works while every field happens to
            # be referenced inside this function.
            failure_msg = message.format(
                score=score,
                expected_score=expected_score,
                word=word,
                context=context,
            )
            self.assertAlmostEqual(score, expected_score, msg=failure_msg, places=4)

        return test_method

    @classmethod
    def add_sum_to_1_test(cls, context):
        """Return a test method checking that scores in ``context`` sum to 1."""

        def test(self):
            s = sum(self.model.score(w, context) for w in self.model.vocab)
            self.assertAlmostEqual(s, 1.0, msg="The context is {}".format(context))

        return test
|
||||
|
||||
|
||||
@add_metaclass(ParametrizeTestsMeta)
class MleBigramTests(unittest.TestCase):
    """Unit tests for the MLE model of order 2."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        ("d", ["c"], 1),
        # Unseen ngrams should yield 0
        ("d", ["e"], 0),
        # Unigrams that are in the vocabulary but unseen should also be 0
        ("z", None, 0),
        # N unigrams = 14
        # count('a') = 2
        ("a", None, 2.0 / 14),
        # 'y' is out of vocabulary, so it gets the "<UNK>" score
        # count('<UNK>') = 3
        ("y", None, 3.0 / 14),
    ]

    def setUp(self):
        vocabulary, train = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocabulary)
        self.model.fit(train)

    def test_logscore_zero_score(self):
        # logscore of unseen ngrams should be -inf
        self.assertTrue(math.isinf(self.model.logscore("d", ["e"])))

    def test_entropy_perplexity_seen(self):
        # Ngram = Log score
        # <s>, a = -1
        # a, b = -1
        # b, UNK = -1
        # UNK, a = -1.585
        # a, d = -1
        # d, </s> = -1
        # TOTAL logscores = -6.585
        # - AVG logscores = 1.0975
        expected_entropy = 1.0975
        expected_perplexity = 2.1398

        # ngrams seen during training
        seen_bigrams = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]

        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(seen_bigrams), places=4
        )
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(seen_bigrams), places=4
        )

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        with_unseen = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]

        entropy = self.model.entropy(with_unseen)
        self.assertTrue(math.isinf(entropy))
        perplexity = self.model.perplexity(with_unseen)
        self.assertTrue(math.isinf(perplexity))

    def test_entropy_perplexity_unigrams(self):
        # word = score, log score
        # <s> = 0.1429, -2.8074
        # a = 0.1429, -2.8074
        # c = 0.0714, -3.8073
        # UNK = 0.2143, -2.2224
        # d = 0.1429, -2.8074
        # c = 0.0714, -3.8073
        # </s> = 0.1429, -2.8074
        # TOTAL logscores = -21.6243
        # - AVG logscores = 3.0095
        expected_entropy = 3.0095
        expected_perplexity = 8.0529

        unigrams = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]

        self.assertAlmostEqual(expected_entropy, self.model.entropy(unigrams), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(unigrams), places=4
        )
|
||||
|
||||
|
||||
@add_metaclass(ParametrizeTestsMeta)
class MleTrigramTests(unittest.TestCase):
    """Unit tests for the MLE model of order 3."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # count(d | b, c) = 1
        # count(b, c) = 1
        ("d", ("b", "c"), 1),
        # count(d | c) = 1
        # count(c) = 1
        ("d", ["c"], 1),
        # total number of tokens is 18, of which "a" occurred 2 times
        ("a", None, 2.0 / 18),
        # in vocabulary but unseen
        ("z", None, 0),
        # out of vocabulary should use "UNK" score
        ("y", None, 3.0 / 18),
    ]

    def setUp(self):
        vocabulary, train = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocabulary)
        self.model.fit(train)
|
||||
|
||||
|
||||
@add_metaclass(ParametrizeTestsMeta)
class LidstoneBigramTests(unittest.TestCase):
    """Unit tests for the Lidstone model of order 2 with gamma 0.1."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    # A leading * marks a count after smoothing has been applied.
    score_tests = [
        # count(d | c) = 1
        # *count(d | c) = 1.1
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 1.8
        ("d", ["c"], 1.1 / 1.8),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 0.8 = 14.8
        # count("a") = 2
        # *count("a") = 2.1
        ("a", None, 2.1 / 14.8),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 0.1
        ("z", None, 0.1 / 14.8),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 3.1
        ("y", None, 3.1 / 14.8),
    ]

    def setUp(self):
        vocabulary, train = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocabulary)
        self.model.fit(train)

    def test_gamma(self):
        self.assertEqual(0.1, self.model.gamma)

    def test_entropy_perplexity(self):
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a = 0.3929, -1.3479
        # a, c = 0.0357, -4.8074
        # c, UNK = 0.0(5), -4.1699
        # UNK, d = 0.0263, -5.2479
        # d, c = 0.0357, -4.8074
        # c, </s> = 0.0(5), -4.1699
        # TOTAL logscore: -24.5504
        # - AVG logscore: 4.0917
        expected_entropy = 4.0917
        expected_perplexity = 17.0504

        bigrams = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]

        self.assertAlmostEqual(expected_entropy, self.model.entropy(bigrams), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(bigrams), places=4
        )
|
||||
|
||||
|
||||
@add_metaclass(ParametrizeTestsMeta)
class LidstoneTrigramTests(unittest.TestCase):
    """Unit tests for the Lidstone model of order 3 with gamma 0.1."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # Logic behind this is the same as for bigram model
        ("d", ["c"], 1.1 / 1.8),
        # a word that was not counted after this context
        ("e", ["c"], 0.1 / 1.8),
        # Trigram score now
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    def setUp(self):
        vocabulary, train = _prepare_test_data(3)
        self.model = Lidstone(0.1, 3, vocabulary=vocabulary)
        self.model.fit(train)
|
||||
|
||||
|
||||
@add_metaclass(ParametrizeTestsMeta)
class LaplaceBigramTests(unittest.TestCase):
    """Unit tests for the Laplace model of order 2."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    # A leading * marks a count after smoothing has been applied.
    score_tests = [
        # basic sanity-check:
        # count(d | c) = 1
        # *count(d | c) = 2
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 8 = 22
        # count("a") = 2
        # *count("a") = 3
        ("a", None, 3.0 / 22),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 1
        ("z", None, 1.0 / 22),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocabulary, train = _prepare_test_data(2)
        self.model = Laplace(2, vocabulary=vocabulary)
        self.model.fit(train)

    def test_gamma(self):
        # Make sure the gamma is set to 1
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a = 0.2, -2.3219
        # a, c = 0.1, -3.3219
        # c, UNK = 0.(1), -3.1699
        # UNK, d = 0.(09), -3.4594
        # d, c = 0.1, -3.3219
        # c, </s> = 0.(1), -3.1699
        # Total logscores: -18.7651
        # - AVG logscores: 3.1275
        expected_entropy = 3.1275
        expected_perplexity = 8.7393

        bigrams = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]

        self.assertAlmostEqual(expected_entropy, self.model.entropy(bigrams), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(bigrams), places=4
        )
|
||||
|
||||
|
||||
@add_metaclass(ParametrizeTestsMeta)
class WittenBellInterpolatedTrigramTests(unittest.TestCase):
    """Unit tests for the WittenBellInterpolated model of order 3."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # For unigram scores by default revert to MLE
        # Total unigrams: 18
        # count('c'): 1
        ("c", None, 1.0 / 18),
        # in vocabulary but unseen
        # count("z") = 0
        ("z", None, 0.0 / 18),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        ("y", None, 3.0 / 18),
        # gamma(['b']) = 0.1111
        # mle.score('c', ['b']) = 0.5
        # (1 - gamma) * mle + gamma * mle('c') ~= 0.45 + .3 / 18
        ("c", ["b"], (1 - 0.1111) * 0.5 + 0.1111 * 1 / 18),
        # building on that, let's try 'a b c' as the trigram
        # gamma(['a', 'b']) = 0.0667
        # mle("c", ["a", "b"]) = 1
        ("c", ["a", "b"], (1 - 0.0667) + 0.0667 * ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
    ]

    def setUp(self):
        vocabulary, train = _prepare_test_data(3)
        self.model = WittenBellInterpolated(3, vocabulary=vocabulary)
        self.model.fit(train)
|
||||
|
||||
|
||||
@add_metaclass(ParametrizeTestsMeta)
class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
    """Unit tests for the KneserNeyInterpolated model of order 3."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # For unigram scores revert to uniform
        # Vocab size: 8
        # count('c'): 1
        ("c", None, 1.0 / 8),
        # in vocabulary but unseen, still uses uniform
        ("z", None, 1 / 8),
        # out of vocabulary should use "UNK" score, i.e. again uniform
        ("y", None, 1.0 / 8),
        # alpha = count('bc') - discount = 1 - 0.1 = 0.9
        # gamma(['b']) = discount * number of unique words that follow ['b'] = 0.1 * 2
        # normalizer = total number of bigrams with this context = 2
        # the final should be: (alpha + gamma * unigram_score("c"))
        ("c", ["b"], (0.9 + 0.2 * (1 / 8)) / 2),
        # building on that, let's try 'a b c' as the trigram
        # alpha = count('abc') - discount = 1 - 0.1 = 0.9
        # gamma(['a', 'b']) = 0.1 * 1
        # normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
        ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
    ]

    def setUp(self):
        vocabulary, train = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, vocabulary=vocabulary)
        self.model.fit(train)
|
||||
|
||||
|
||||
class NgramModelTextGenerationTests(unittest.TestCase):
    """Using MLE estimator, generate some text."""

    def setUp(self):
        vocabulary, train = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocabulary)
        self.model.fit(train)

    def test_generate_one_no_context(self):
        generated = self.model.generate(random_seed=3)
        self.assertEqual(generated, "<UNK>")

    def test_generate_one_limiting_context(self):
        # We don't need random_seed for contexts with only one continuation
        for seed in (["c"], ["b", "c"], ["a", "c"]):
            self.assertEqual(self.model.generate(text_seed=seed), "d")

    def test_generate_one_varied_context(self):
        # When context doesn't limit our options enough, seed the random choice
        generated = self.model.generate(text_seed=("a", "<s>"), random_seed=2)
        self.assertEqual(generated, "a")

    def test_generate_cycle(self):
        # Add a cycle to the model: bd -> b, db -> d
        cycle_sent = list(padded_everygrams(self.model.order, list("bdbdbd")))
        self.model.fit([cycle_sent])
        # Test that we can escape the cycle
        generated = self.model.generate(7, text_seed=("b", "d"), random_seed=5)
        self.assertEqual(generated, ["b", "d", "b", "d", "b", "d", "</s>"])

    def test_generate_with_text_seed(self):
        generated = self.model.generate(5, text_seed=("<s>", "e"), random_seed=3)
        self.assertEqual(generated, ["<UNK>", "a", "d", "b", "<UNK>"])

    def test_generate_oov_text_seed(self):
        # An out-of-vocabulary seed should give the same result as "<UNK>".
        from_oov = self.model.generate(text_seed=("aliens",), random_seed=3)
        from_unk = self.model.generate(text_seed=("<UNK>",), random_seed=3)
        self.assertEqual(from_oov, from_unk)

    def test_generate_None_text_seed(self):
        # should crash with type error when we try to look it up in vocabulary
        with self.assertRaises(TypeError):
            self.model.generate(text_seed=(None,))

        # This will work
        with_none_seed = self.model.generate(text_seed=None, random_seed=3)
        without_seed = self.model.generate(random_seed=3)
        self.assertEqual(with_none_seed, without_seed)
|
||||
@@ -0,0 +1,31 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Model Unit Tests
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
import unittest
|
||||
|
||||
from nltk.lm.preprocessing import padded_everygram_pipeline
|
||||
|
||||
|
||||
class TestPreprocessing(unittest.TestCase):
    """Tests for ``nltk.lm.preprocessing``."""

    def test_padded_everygram_pipeline(self):
        """The pipeline yields padded everygrams and the flat padded vocabulary."""
        unigrams = [("<s>",), ("a",), ("b",), ("c",), ("</s>",)]
        bigrams = [("<s>", "a"), ("a", "b"), ("b", "c"), ("c", "</s>")]
        expected_train = [unigrams + bigrams]
        expected_vocab = ["<s>", "a", "b", "c", "</s>"]

        train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])

        actual_train = [list(sent) for sent in train_data]
        self.assertEqual(actual_train, expected_train)
        self.assertEqual(list(vocab_data), expected_vocab)
|
||||
@@ -0,0 +1,141 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Language Model Unit Tests
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import unittest
|
||||
from collections import Counter
|
||||
|
||||
import six
|
||||
from nltk.lm import Vocabulary
|
||||
|
||||
|
||||
class NgramModelVocabularyTests(unittest.TestCase):
    """Tests for the ``Vocabulary`` class."""

    # Training tokens shared by the fixture and the Counter-based test.
    TOKENS = ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]

    @classmethod
    def setUpClass(cls):
        cls.vocab = Vocabulary(list(cls.TOKENS), unk_cutoff=2)

    def test_truthiness(self):
        self.assertTrue(self.vocab)

    def test_cutoff_value_set_correctly(self):
        self.assertEqual(self.vocab.cutoff, 2)

    def test_unable_to_change_cutoff(self):
        with self.assertRaises(AttributeError):
            self.vocab.cutoff = 3

    def test_cutoff_setter_checks_value(self):
        with self.assertRaises(ValueError) as caught:
            Vocabulary("abc", unk_cutoff=0)
        self.assertEqual(
            "Cutoff value cannot be less than 1. Got: 0", str(caught.exception)
        )

    def test_counts_set_correctly(self):
        for token, expected in (("a", 2), ("b", 2), ("c", 1)):
            self.assertEqual(self.vocab.counts[token], expected)

    def test_membership_check_respects_cutoff(self):
        # a was seen 2 times, so it should be considered part of the vocabulary
        self.assertTrue("a" in self.vocab)
        # "c" was seen once, it shouldn't be considered part of the vocab
        self.assertFalse("c" in self.vocab)
        # "z" was seen only once as well, so it is below the cutoff too
        self.assertFalse("z" in self.vocab)

    def test_vocab_len_respects_cutoff(self):
        # Vocab size is the number of unique tokens that occur at least as often
        # as the cutoff value, plus 1 to account for unknown words.
        self.assertEqual(5, len(self.vocab))

    def test_vocab_iter_respects_cutoff(self):
        all_counted = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
        above_cutoff = ["a", "b", "d", "e", "<UNK>"]

        six.assertCountEqual(self, all_counted, list(self.vocab.counts.keys()))
        six.assertCountEqual(self, above_cutoff, list(self.vocab))

    def test_update_empty_vocab(self):
        fresh = Vocabulary(unk_cutoff=2)
        self.assertEqual(len(fresh), 0)
        self.assertFalse(fresh)
        self.assertIn(fresh.unk_label, fresh)

        fresh.update(list("abcde"))
        self.assertIn(fresh.unk_label, fresh)

    def test_lookup(self):
        self.assertEqual(self.vocab.lookup("a"), "a")
        self.assertEqual(self.vocab.lookup("c"), "<UNK>")

    def test_lookup_iterables(self):
        self.assertEqual(self.vocab.lookup(["a", "b"]), ("a", "b"))
        self.assertEqual(self.vocab.lookup(("a", "b")), ("a", "b"))
        self.assertEqual(self.vocab.lookup(("a", "c")), ("a", "<UNK>"))
        digits = map(str, range(3))
        self.assertEqual(self.vocab.lookup(digits), ("<UNK>", "<UNK>", "<UNK>"))

    def test_lookup_empty_iterables(self):
        self.assertEqual(self.vocab.lookup(()), ())
        self.assertEqual(self.vocab.lookup([]), ())
        self.assertEqual(self.vocab.lookup(iter([])), ())
        self.assertEqual(self.vocab.lookup(n for n in range(0, 0)), ())

    def test_lookup_recursive(self):
        self.assertEqual(
            self.vocab.lookup([["a", "b"], ["a", "c"]]), (("a", "b"), ("a", "<UNK>"))
        )
        self.assertEqual(self.vocab.lookup([["a", "b"], "c"]), (("a", "b"), "<UNK>"))
        self.assertEqual(self.vocab.lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),))

    def test_lookup_None(self):
        with self.assertRaises(TypeError):
            self.vocab.lookup(None)
        with self.assertRaises(TypeError):
            list(self.vocab.lookup([None, None]))

    def test_lookup_int(self):
        with self.assertRaises(TypeError):
            self.vocab.lookup(1)
        with self.assertRaises(TypeError):
            list(self.vocab.lookup([1, 2]))

    def test_lookup_empty_str(self):
        self.assertEqual(self.vocab.lookup(""), "<UNK>")

    def test_eqality(self):
        base = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        same = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        other_label = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
        fewer_tokens = Vocabulary(["a", "b"], unk_cutoff=1)

        self.assertEqual(base, same)
        self.assertNotEqual(base, other_label)
        self.assertNotEqual(base, fewer_tokens)

    def test_str(self):
        expected = "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>"
        self.assertEqual(str(self.vocab), expected)

    def test_creation_with_counter(self):
        from_counter = Vocabulary(Counter(list(self.TOKENS)), unk_cutoff=2)
        self.assertEqual(self.vocab, from_counter)
|
||||
@@ -0,0 +1,65 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unit tests for nltk.compat.
|
||||
See also nltk/test/compat.doctest.
|
||||
"""
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
import unittest
|
||||
|
||||
from nltk.text import Text
|
||||
from nltk.compat import PY3, python_2_unicode_compatible
|
||||
|
||||
|
||||
def setup_module(module):
    """Skip the whole module when running under Python 3."""
    from nose import SkipTest

    if not PY3:
        return
    raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x")
|
||||
|
||||
|
||||
class TestTextTransliteration(unittest.TestCase):
    """Checks the byte-string ``repr`` and ``str`` of a Text with non-ASCII tokens."""

    txt = Text(["São", "Tomé", "and", "Príncipe"])

    def test_repr(self):
        expected = br"<Text: S\xe3o Tom\xe9 and Pr\xedncipe...>"
        self.assertEqual(repr(self.txt), expected)

    def test_str(self):
        expected = b"<Text: Sao Tome and Principe...>"
        self.assertEqual(str(self.txt), expected)
|
||||
|
||||
|
||||
class TestFraction(unittest.TestCase):
    """Checks ``nltk.compat.Fraction`` against the native ``fractions.Fraction``."""

    def test_unnoramlize_fraction(self):
        """``_normalize=False`` keeps numerator and denominator unreduced.

        unittest assertions are used instead of bare ``assert`` statements so
        the checks still run under ``python -O`` and report both values on
        failure.
        """
        from fractions import Fraction as NativePythonFraction
        from nltk.compat import Fraction as NLTKFraction

        # The native fraction should throw a TypeError in Python < 3.5
        with self.assertRaises(TypeError):
            NativePythonFraction(0, 1000, _normalize=False)

        # Using nltk.compat.Fraction in Python < 3.5
        compat_frac = NLTKFraction(0, 1000, _normalize=False)
        # The numerator and denominator do not change.
        self.assertEqual(compat_frac.numerator, 0)
        self.assertEqual(compat_frac.denominator, 1000)
        # The floating point value remains normalized.
        self.assertEqual(float(compat_frac), 0.0)

        # Checks that the numerator and denominator are not divided by
        # their greatest common divisor (gcd).
        six_twelve = NLTKFraction(6, 12, _normalize=False)
        self.assertEqual(six_twelve.numerator, 6)
        self.assertEqual(six_twelve.denominator, 12)

        one_two = NLTKFraction(1, 2, _normalize=False)
        self.assertEqual(one_two.numerator, 1)
        self.assertEqual(one_two.denominator, 2)

        # Checks against the native fraction.
        six_twelve_original = NativePythonFraction(6, 12)
        # Checks that the rational values of one_two and six_twelve are the same.
        self.assertEqual(float(one_two), float(six_twelve))
        self.assertEqual(float(six_twelve), float(six_twelve_original))

        # Checks that the fraction does get normalized, even when
        # _normalize == False when numerator is using native
        # fractions.Fraction.from_float
        self.assertEqual(
            NLTKFraction(3.142, _normalize=False), NativePythonFraction(3.142)
        )
|
||||
@@ -0,0 +1,78 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unit tests for nltk.metrics.aline
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import unittest
|
||||
|
||||
from nltk.metrics import aline
|
||||
|
||||
|
||||
class TestAline(unittest.TestCase):
    """
    Test Aline algorithm for aligning phonetic sequences
    """

    def test_aline(self):
        """Align four word pairs and compare with the expected alignments."""
        alignment = aline.align('θin', 'tenwis')
        self.assertEqual(
            alignment,
            [[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]],
        )

        alignment = aline.align('jo', 'ʒə')
        self.assertEqual(alignment, [[('j', 'ʒ'), ('o', 'ə')]])

        alignment = aline.align('pematesiweni', 'pematesewen')
        self.assertEqual(
            alignment,
            [
                [
                    ('p', 'p'),
                    ('e', 'e'),
                    ('m', 'm'),
                    ('a', 'a'),
                    ('t', 't'),
                    ('e', 'e'),
                    ('s', 's'),
                    ('i', 'e'),
                    ('w', 'w'),
                    ('e', 'e'),
                    ('n', 'n'),
                    ('i', '-'),
                ]
            ],
        )

        alignment = aline.align('tuwθ', 'dentis')
        self.assertEqual(
            alignment,
            [
                [
                    ('t', 'd'),
                    ('u', 'e'),
                    ('w', '-'),
                    ('-', 'n'),
                    ('-', 't'),
                    ('-', 'i'),
                    ('θ', 's'),
                ]
            ],
        )

    def test_aline_delta(self):
        """
        Test aline for computing the difference between two segments
        """
        self.assertEqual(aline.delta('p', 'q'), 20.0)
        self.assertEqual(aline.delta('a', 'A'), 0.0)
|
||||
@@ -0,0 +1,37 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for Brill tagger.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from nltk.tag import UnigramTagger, brill, brill_trainer
|
||||
from nltk.tbl import Template
|
||||
from nltk.corpus import treebank
|
||||
|
||||
from nltk.tbl import demo
|
||||
|
||||
|
||||
class TestBrill(unittest.TestCase):
    """Tests for the Brill tagger trained on top of a unigram tagger."""

    def test_pos_template(self):
        """Train with a single ``Pos([-1])`` template and tag a sample sentence."""
        training_sents = treebank.tagged_sents()[:1000]
        baseline = UnigramTagger(training_sents)
        templates = [brill.Template(brill.Pos([-1]))]
        trainer = brill_trainer.BrillTaggerTrainer(baseline, templates)
        brill_tagger = trainer.train(training_sents)

        # Example from https://github.com/nltk/nltk/issues/769
        tokens = 'This is a foo bar sentence'.split()
        tagged = brill_tagger.tag(tokens)
        self.assertEqual(
            tagged,
            [
                ('This', 'DT'),
                ('is', 'VBZ'),
                ('a', 'DT'),
                ('foo', None),
                ('bar', 'NN'),
                ('sentence', None),
            ],
        )

    @unittest.skip("Should be tested in __main__ of nltk.tbl.demo")
    def test_brill_demo(self):
        demo()
|
||||
@@ -0,0 +1,39 @@
|
||||
import unittest
|
||||
from nltk import ConditionalFreqDist, tokenize
|
||||
|
||||
class TestEmptyCondFreq(unittest.TestCase):
    """Check that read-only operations never add conditions to a
    ConditionalFreqDist, while normal mutation still works."""

    def test_tabulate(self):
        """Tabulating an unknown condition must leave the distribution empty."""
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        try:
            empty.tabulate(conditions="BUG")  # nonexistent keys shouldn't be added
        except Exception:
            # Best effort: tabulate may fail on an empty distribution; only the
            # absence of side effects matters here.  Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            pass
        self.assertEqual(empty.conditions(), [])

    def test_plot(self):
        """Plotting an unknown condition must leave the distribution empty."""
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        try:
            empty.plot(conditions=["BUG"])  # nonexistent keys shouldn't be added
        except Exception:
            # Best effort, as in test_tabulate: plotting may fail; only the
            # absence of side effects is checked.
            pass
        self.assertEqual(empty.conditions(), [])

    def test_increment(self):
        """Incrementing counts, including for unseen conditions, still works."""
        # make sure that we can still mutate cfd normally
        text = "cow cat mouse cat tiger"
        cfd = ConditionalFreqDist()

        # create cfd with word length as condition
        for word in tokenize.word_tokenize(text):
            condition = len(word)
            cfd[condition][word] += 1

        self.assertEqual(cfd.conditions(), [3, 5])

        # incrementing previously unseen key is still possible
        cfd[2]['hi'] += 1
        self.assertEqual(set(cfd.conditions()), {3, 5, 2})  # new condition added
        # key's frequency incremented from 0 (unseen) to 1
        self.assertEqual(cfd[2]['hi'], 1)
|
||||
@@ -0,0 +1,49 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import unittest
|
||||
import nltk
|
||||
from nltk.grammar import CFG
|
||||
|
||||
|
||||
class ChomskyNormalFormForCFGTest(unittest.TestCase):
    """Tests for converting context-free grammars to Chomsky normal form."""

    def test_simple(self):
        """Convert two small hand-written grammars (flexible and strict CNF)."""
        flexible_source = """
        S -> NP VP
        PP -> P NP
        NP -> Det N | NP PP P
        VP -> V NP | VP PP
        VP -> Det
        Det -> 'a' | 'the'
        N -> 'dog' | 'cat'
        V -> 'chased' | 'sat'
        P -> 'on' | 'in'
        """
        grammar = CFG.fromstring(flexible_source)
        self.assertFalse(grammar.is_flexible_chomsky_normal_form())
        self.assertFalse(grammar.is_chomsky_normal_form())

        # Flexible conversion: the result is flexible CNF but not strict CNF.
        grammar = grammar.chomsky_normal_form(flexible=True)
        self.assertTrue(grammar.is_flexible_chomsky_normal_form())
        self.assertFalse(grammar.is_chomsky_normal_form())

        strict_source = """
        S -> NP VP
        NP -> VP N P
        VP -> P
        N -> 'dog' | 'cat'
        P -> 'on' | 'in'
        """
        grammar2 = CFG.fromstring(strict_source)
        self.assertFalse(grammar2.is_flexible_chomsky_normal_form())
        self.assertFalse(grammar2.is_chomsky_normal_form())

        # Default conversion: the result satisfies both checks.
        grammar2 = grammar2.chomsky_normal_form()
        self.assertTrue(grammar2.is_flexible_chomsky_normal_form())
        self.assertTrue(grammar2.is_chomsky_normal_form())

    def test_complex(self):
        """Flexible conversion of the large ATIS grammar."""
        grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
        self.assertFalse(grammar.is_flexible_chomsky_normal_form())
        self.assertFalse(grammar.is_chomsky_normal_form())

        grammar = grammar.chomsky_normal_form(flexible=True)
        self.assertTrue(grammar.is_flexible_chomsky_normal_form())
        self.assertFalse(grammar.is_chomsky_normal_form())
|
||||
@@ -0,0 +1,87 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
import unittest
|
||||
|
||||
from nltk import RegexpParser
|
||||
|
||||
|
||||
class TestChunkRule(unittest.TestCase):
    """Regression tests for chunk rules parsed by ``RegexpParser``."""

    def test_tag_pattern2re_pattern_quantifier(self):
        """Test for bug https://github.com/nltk/nltk/issues/1597

        Ensures that curly bracket quantifiers can be used inside a chunk rule.
        This type of quantifier has been used for the supplementary example
        in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
        """
        sent = [
            ('The', 'AT'),
            ('September-October', 'NP'),
            ('term', 'NN'),
            ('jury', 'NN'),
            ('had', 'HVD'),
            ('been', 'BEN'),
            ('charged', 'VBN'),
            ('by', 'IN'),
            ('Fulton', 'NP-TL'),
            ('Superior', 'JJ-TL'),
            ('Court', 'NN-TL'),
            ('Judge', 'NN-TL'),
            ('Durwood', 'NP'),
            ('Pye', 'NP'),
            ('to', 'TO'),
            ('investigate', 'VB'),
            ('reports', 'NNS'),
            ('of', 'IN'),
            ('possible', 'JJ'),
            ('``', '``'),
            ('irregularities', 'NNS'),
            ("''", "''"),
            ('in', 'IN'),
            ('the', 'AT'),
            ('hard-fought', 'JJ'),
            ('primary', 'NN'),
            ('which', 'WDT'),
            ('was', 'BEDZ'),
            ('won', 'VBN'),
            ('by', 'IN'),
            ('Mayor-nominate', 'NN-TL'),
            ('Ivan', 'NP'),
            ('Allen', 'NP'),
            ('Jr.', 'NP'),
            ('.', '.'),
        ]  # source: brown corpus
        cp = RegexpParser('CHUNK: {<N.*>{4,}}')
        tree = cp.parse(sent)

        # Expected pretty-printed tree: the root label on the first line and
        # one child per following line.
        # NOTE(review): the two-space child indent is written out explicitly
        # and assumes Tree.pformat()'s multi-line layout -- confirm against
        # the upstream test file.
        children = [
            "The/AT",
            "September-October/NP",
            "term/NN",
            "jury/NN",
            "had/HVD",
            "been/BEN",
            "charged/VBN",
            "by/IN",
            "Fulton/NP-TL",
            "Superior/JJ-TL",
            "(CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)",
            "to/TO",
            "investigate/VB",
            "reports/NNS",
            "of/IN",
            "possible/JJ",
            "``/``",
            "irregularities/NNS",
            "''/''",
            "in/IN",
            "the/AT",
            "hard-fought/JJ",
            "primary/NN",
            "which/WDT",
            "was/BEDZ",
            "won/VBN",
            "by/IN",
            "(CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)",
            "./.)",
        ]
        expected = "\n".join(["(S"] + ["  " + child for child in children])

        # assertEqual (instead of a bare assert) still runs under ``python -O``
        # and reports a readable diff on failure.
        self.assertEqual(tree.pformat(), expected)
|
||||
@@ -0,0 +1,50 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unit tests for nltk.classify. See also: nltk/test/classify.doctest
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from nose import SkipTest
|
||||
from nltk import classify
|
||||
|
||||
# Labelled training featuresets: (features, label) pairs.
TRAIN = [
    ({'a': 1, 'b': 1, 'c': 1}, 'y'),
    ({'a': 1, 'b': 1, 'c': 1}, 'x'),
    ({'a': 1, 'b': 1, 'c': 0}, 'y'),
    ({'a': 0, 'b': 1, 'c': 1}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
    ({'a': 0, 'b': 0, 'c': 1}, 'y'),
    ({'a': 0, 'b': 1, 'c': 0}, 'x'),
    ({'a': 0, 'b': 0, 'c': 0}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
]

# Unlabelled featuresets to classify.
TEST = [
    {'a': 1, 'b': 0, 'c': 1},  # unseen
    {'a': 1, 'b': 0, 'c': 0},  # unseen
    {'a': 0, 'b': 1, 'c': 1},  # seen 3 times, labels=y,y,x
    {'a': 0, 'b': 1, 'c': 0},  # seen 1 time, label=x
]

# Expected (P(x), P(y)) for each entry of TEST, in the same order.
RESULTS = [(0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24)]
|
||||
|
||||
|
||||
def assert_classifier_correct(algorithm):
    """Train a MaxentClassifier on TRAIN with *algorithm* and compare its
    probabilities on TEST with RESULTS (absolute tolerance 1e-2).

    Raises SkipTest when training raises LookupError or AttributeError.
    """
    try:
        classifier = classify.MaxentClassifier.train(
            TRAIN, algorithm, trace=0, max_iter=1000
        )
    except (LookupError, AttributeError) as err:
        raise SkipTest(str(err))

    for featureset, (expected_x, expected_y) in zip(TEST, RESULTS):
        pdist = classifier.prob_classify(featureset)
        assert abs(pdist.prob('x') - expected_x) < 1e-2, (pdist.prob('x'), expected_x)
        assert abs(pdist.prob('y') - expected_y) < 1e-2, (pdist.prob('y'), expected_y)
|
||||
|
||||
|
||||
def test_megam():
    """Check MaxentClassifier probabilities using the 'MEGAM' algorithm."""
    assert_classifier_correct('MEGAM')
|
||||
|
||||
|
||||
def test_tadm():
    """Check MaxentClassifier probabilities using the 'TADM' algorithm."""
    assert_classifier_correct('TADM')
|
||||
@@ -0,0 +1,159 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
import unittest
|
||||
|
||||
from nltk.collocations import BigramCollocationFinder
|
||||
from nltk.metrics import BigramAssocMeasures
|
||||
|
||||
## Test bigram counters with discontinuous bigrams and repeated words
|
||||
|
||||
_EPSILON = 1e-8


def close_enough(x, y):
    """Verify that two sequences of n-gram association values are within
    _EPSILON of each other.

    :param x: iterable of ``(ngram, score)`` pairs
    :param y: iterable of ``(ngram, score)`` pairs
    :return: True when both inputs have the same number of entries and every
        aligned pair has equal n-grams and scores that differ by at most
        _EPSILON; False otherwise.
    """
    x = list(x)
    y = list(y)

    # zip() stops at the shorter input, so without this check a missing or
    # extra entry would silently be treated as a match.
    if len(x) != len(y):
        return False

    for (x1, y1) in zip(x, y):
        if x1[0] != y1[0] or abs(x1[1] - y1[1]) > _EPSILON:
            return False
    return True
|
||||
|
||||
|
||||
class TestBigram(unittest.TestCase):
    """Bigram counts and PMI scores for a sentence of repeated words,
    using the default window and window sizes 3 and 5."""

    def test_bigram2(self):
        """Default (contiguous) bigram window."""
        sent = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(sent)

        expected_counts = [
            (('a', 'a'), 1),
            (('a', 'test'), 1),
            (('is', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'is'), 1),
            (('this', 'this'), 1),
        ]
        expected_words = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'a'), 1.0),
            (('a', 'test'), 1.0),
            (('is', 'a'), 1.0),
            (('is', 'is'), 1.0),
            (('test', 'test'), 1.0),
            (('this', 'is'), 1.0),
            (('this', 'this'), 1.0),
        ]

        # python 2.6 does not have assertItemsEqual or assertListEqual
        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))
        self.assertTrue(
            len(sent)
            == sum(finder.word_fd.values())
            == sum(finder.ngram_fd.values()) + 1
        )
        self.assertTrue(
            close_enough(
                sorted(finder.score_ngrams(BigramAssocMeasures.pmi)),
                sorted(expected_pmi),
            )
        )

    def test_bigram3(self):
        """Bigrams collected with ``window_size=3``."""
        sent = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(sent, window_size=3)

        expected_counts = [
            (('a', 'test'), 3),
            (('is', 'a'), 3),
            (('this', 'is'), 3),
            (('a', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'this'), 1),
        ]
        expected_words = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'test'), 1.584962500721156),
            (('is', 'a'), 1.584962500721156),
            (('this', 'is'), 1.584962500721156),
            (('a', 'a'), 0.0),
            (('is', 'is'), 0.0),
            (('test', 'test'), 0.0),
            (('this', 'this'), 0.0),
        ]

        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))
        self.assertTrue(
            len(sent)
            == sum(finder.word_fd.values())
            == (sum(finder.ngram_fd.values()) + 2 + 1) / 2.0
        )
        self.assertTrue(
            close_enough(
                sorted(finder.score_ngrams(BigramAssocMeasures.pmi)),
                sorted(expected_pmi),
            )
        )

    def test_bigram5(self):
        """Bigrams collected with ``window_size=5``."""
        sent = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(sent, window_size=5)

        expected_counts = [
            (('a', 'test'), 4),
            (('is', 'a'), 4),
            (('this', 'is'), 4),
            (('is', 'test'), 3),
            (('this', 'a'), 3),
            (('a', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'this'), 1),
        ]
        expected_words = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'test'), 1.0),
            (('is', 'a'), 1.0),
            (('this', 'is'), 1.0),
            (('is', 'test'), 0.5849625007211562),
            (('this', 'a'), 0.5849625007211562),
            (('a', 'a'), -1.0),
            (('is', 'is'), -1.0),
            (('test', 'test'), -1.0),
            (('this', 'this'), -1.0),
        ]

        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))
        self.assertTrue(
            len(sent)
            == sum(finder.word_fd.values())
            == (sum(finder.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0
        )
        self.assertTrue(
            close_enough(
                sorted(finder.score_ngrams(BigramAssocMeasures.pmi)),
                sorted(expected_pmi),
            )
        )
|
||||
@@ -0,0 +1,107 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
import unittest
|
||||
import contextlib
|
||||
import sys
|
||||
|
||||
from nose import with_setup
|
||||
|
||||
from nltk.corpus import gutenberg
|
||||
from nltk.text import Text
|
||||
|
||||
try:
|
||||
from StringIO import StringIO
|
||||
except ImportError as e:
|
||||
from io import StringIO
|
||||
|
||||
|
||||
@contextlib.contextmanager
def stdout_redirect(where):
    """Temporarily replace ``sys.stdout`` with *where*.

    :param where: file-like object that receives everything written to
        ``sys.stdout`` inside the ``with`` block
    :return: context manager yielding *where*

    On exit the stream that was active on entry is restored, even if the
    body raises.
    """
    # Remember the current stream rather than assuming sys.__stdout__:
    # test runners and nested redirects install their own sys.stdout, and
    # resetting to the interpreter's original stream would undo them.
    previous = sys.stdout
    sys.stdout = where
    try:
        yield where
    finally:
        sys.stdout = previous
|
||||
|
||||
|
||||
class TestConcordance(unittest.TestCase):
    """Text constructed using: http://www.nltk.org/book/ch01.html"""

    @classmethod
    def setup_class(cls):
        # Class-level fixture: the Moby Dick word list from the gutenberg corpus.
        cls.corpus = gutenberg.words('melville-moby_dick.txt')

    @classmethod
    def teardown_class(cls):
        pass

    def setUp(self):
        self.text = Text(TestConcordance.corpus)
        self.query = "monstrous"
        self.maxDiff = None
        # Expected concordance lines for the query, in order.
        self.list_out = [
            'ong the former , one was of a most monstrous size . ... This came towards us , ',
            'ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r',
            'll over with a heathenish array of monstrous clubs and spears . Some were thick',
            'd as you gazed , and wondered what monstrous cannibal and savage could ever hav',
            'that has survived the flood ; most monstrous and most mountainous ! That Himmal',
            'they might scout at Moby Dick as a monstrous fable , or still worse and more de',
            'th of Radney .\'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l',
            'ing Scenes . In connexion with the monstrous pictures of whales , I am strongly',
            'ere to enter upon those still more monstrous stories of them which are to be fo',
            'ght have been rummaged out of this monstrous cabinet there is no telling . But ',
            'of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u',
        ]

    def tearDown(self):
        pass

    def test_concordance_list(self):
        """concordance_list() yields the expected context lines."""
        hits = self.text.concordance_list(self.query)
        self.assertEqual(self.list_out, [hit.line for hit in hits])

    def test_concordance_width(self):
        """With width=0, each hit's query is the matched token itself."""
        # Eleven hits; only the seventh is capitalised.
        list_out = ["monstrous"] * 6 + ["Monstrous"] + ["monstrous"] * 4

        hits = self.text.concordance_list(self.query, width=0)
        self.assertEqual(list_out, [hit.query for hit in hits])

    def test_concordance_lines(self):
        """The ``lines`` argument limits the number of hits returned."""
        hits = self.text.concordance_list(self.query, lines=3)
        self.assertEqual(self.list_out[:3], [hit.line for hit in hits])

    def test_concordance_print(self):
        """concordance() prints a header followed by the context lines."""
        print_out = """Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us ,
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
"""

        with stdout_redirect(StringIO()) as captured:
            self.text.concordance(self.query)

        # Spaces are removed from both sides before comparing, so differences
        # in padding do not matter.
        self.assertEqual(
            print_out.replace(" ", ""), captured.getvalue().replace(" ", "")
        )
|
||||
1419
venv/lib/python3.7/site-packages/nltk/test/unit/test_corenlp.py
Normal file
1419
venv/lib/python3.7/site-packages/nltk/test/unit/test_corenlp.py
Normal file
File diff suppressed because it is too large
Load Diff
272
venv/lib/python3.7/site-packages/nltk/test/unit/test_corpora.py
Normal file
272
venv/lib/python3.7/site-packages/nltk/test/unit/test_corpora.py
Normal file
@@ -0,0 +1,272 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
import unittest
|
||||
|
||||
from nltk.corpus import (
|
||||
sinica_treebank,
|
||||
conll2007,
|
||||
indian,
|
||||
cess_cat,
|
||||
cess_esp,
|
||||
floresta,
|
||||
ptb,
|
||||
udhr,
|
||||
) # mwa_ppdb
|
||||
|
||||
from nltk.compat import python_2_unicode_compatible
|
||||
from nltk.tree import Tree
|
||||
from nltk.test.unit.utils import skipIf
|
||||
|
||||
|
||||
class TestUdhr(unittest.TestCase):
    """Smoke tests over every file of the ``udhr`` corpus."""

    def test_words(self):
        """Every file id yields a non-empty word list."""
        for name in udhr.fileids():
            try:
                words = list(udhr.words(name))
            except AssertionError:
                # Report which file failed before re-raising.
                print(name)
                raise
            self.assertTrue(words)

    def test_raw_unicode(self):
        """``raw()`` never returns ``bytes``."""
        for name in udhr.fileids():
            txt = udhr.raw(name)
            # A TestCase assertion (rather than a bare assert) is still
            # checked under ``python -O``; the file id is the failure message.
            self.assertNotIsInstance(txt, bytes, name)
|
||||
|
||||
|
||||
class TestIndian(unittest.TestCase):
    """Checks on the first tokens of the ``indian`` corpus."""

    def test_words(self):
        first_words = indian.words()[:3]
        self.assertEqual(first_words, ['মহিষের', 'সন্তান', ':'])

    def test_tagged_words(self):
        expected = [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')]
        first_tagged = indian.tagged_words()[:3]
        self.assertEqual(first_tagged, expected)
|
||||
|
||||
|
||||
class TestCess(unittest.TestCase):
    """Checks on the ``cess_cat`` and ``cess_esp`` corpora."""

    def test_catalan(self):
        expected = (
            "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna "
            "a quatre anys d' inhabilitació especial"
        ).split()
        self.assertEqual(cess_cat.words()[:15], expected)
        self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs")

    def test_esp(self):
        expected = (
            "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- "
            "anunció hoy , jueves , la compra del"
        ).split()
        self.assertEqual(cess_esp.words()[:15], expected)
        self.assertEqual(cess_esp.words()[115], "años")
|
||||
|
||||
|
||||
class TestFloresta(unittest.TestCase):
    """Checks on the first tokens of the ``floresta`` corpus."""

    def test_words(self):
        expected = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a".split()
        self.assertEqual(floresta.words()[:10], expected)
|
||||
|
||||
|
||||
class TestSinicaTreebank(unittest.TestCase):
    """Checks on sentences and parses of the ``sinica_treebank`` corpus."""

    def test_sents(self):
        expected = [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']]
        first_3_sents = sinica_treebank.sents()[:3]
        self.assertEqual(first_3_sents, expected)

    def test_parsed_sents(self):
        # Parse of the sentence at index 25.
        parsed = sinica_treebank.parsed_sents()[25]

        subject = Tree('NP', [Tree('Nba', ['嘉珍'])])
        adverbial = Tree('V‧地', [Tree('VA11', ['不停']), Tree('DE', ['的'])])
        verb = Tree('VA4', ['哭泣'])

        self.assertEqual(parsed, Tree('S', [subject, adverbial, verb]))
|
||||
|
||||
|
||||
class TestCoNLL2007(unittest.TestCase):
    # Reading the CoNLL 2007 Dependency Treebanks

    def test_sents(self):
        first_sent = conll2007.sents('esp.train')[0]
        self.assertEqual(
            first_sent[:6], ['El', 'aumento', 'del', 'índice', 'de', 'desempleo']
        )

    def test_parsed_sents(self):
        first_graph = conll2007.parsed_sents('esp.train')[0]

        # The expected tree is assembled bottom-up from named subtrees; the
        # resulting structure is the same single nested Tree expression.
        subject = Tree(
            'aumento',
            [
                'El',
                Tree(
                    'del',
                    [
                        Tree(
                            'índice',
                            [Tree('de', [Tree('desempleo', ['estadounidense'])])],
                        )
                    ],
                ),
            ],
        )
        market = Tree(
            'en',
            [
                Tree(
                    'mercado',
                    ['el', Tree('de', ['divisas']), Tree('de', ['Fráncfort'])],
                )
            ],
        )
        versus = Tree(
            'frente_a',
            [
                ',',
                Tree(
                    '0,9349_dólares',
                    ['los', Tree('de', [Tree('mañana', ['esta'])])],
                ),
            ],
        )
        quoted = Tree(
            'cotizaba',
            [
                ',',
                'que',
                Tree('a', [Tree('15.35', ['las', 'GMT'])]),
                'se',
                market,
                Tree('a', ['0,9452_dólares']),
                versus,
            ],
        )
        target = Tree('al', [Tree('euro', [quoted])])

        expected = Tree(
            'fortaleció', [subject, 'hoy', 'considerablemente', target, '.']
        )
        self.assertEqual(first_graph.tree(), expected)
|
||||
|
||||
|
||||
@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
class TestPTB(unittest.TestCase):
    """Checks on the ``ptb`` corpus; skipped when ``ptb.fileids()`` is empty."""

    def test_fileids(self):
        expected = [
            'BROWN/CF/CF01.MRG',
            'BROWN/CF/CF02.MRG',
            'BROWN/CF/CF03.MRG',
            'BROWN/CF/CF04.MRG',
        ]
        self.assertEqual(ptb.fileids()[:4], expected)

    def test_words(self):
        expected = ['A', 'form', 'of', 'asbestos', 'once', 'used', '*']
        self.assertEqual(ptb.words('WSJ/00/WSJ_0003.MRG')[:7], expected)

    def test_tagged_words(self):
        expected = [('A', 'DT'), ('form', 'NN'), ('of', 'IN')]
        self.assertEqual(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3], expected)

    def test_categories(self):
        expected = [
            'adventure',
            'belles_lettres',
            'fiction',
            'humor',
            'lore',
            'mystery',
            'news',
            'romance',
            'science_fiction',
        ]
        self.assertEqual(ptb.categories(), expected)

    def test_news_fileids(self):
        expected = [
            'WSJ/00/WSJ_0001.MRG',
            'WSJ/00/WSJ_0002.MRG',
            'WSJ/00/WSJ_0003.MRG',
        ]
        self.assertEqual(ptb.fileids('news')[:3], expected)

    def test_category_words(self):
        expected = ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back']
        self.assertEqual(ptb.words(categories=['humor', 'fiction'])[:6], expected)
|
||||
|
||||
|
||||
@unittest.skip("Skipping test for mwa_ppdb.")
class TestMWAPPDB(unittest.TestCase):
    """Checks on the ``mwa_ppdb`` corpus (always skipped).

    NOTE(review): ``mwa_ppdb`` is only mentioned in a comment next to the
    corpus imports, so these methods would raise NameError if the skip were
    removed -- import it before re-enabling.
    """

    def test_fileids(self):
        expected = ['ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs']
        self.assertEqual(mwa_ppdb.fileids(), expected)

    def test_entries(self):
        expected = [
            ('10/17/01', '17/10/2001'),
            ('102,70', '102.70'),
            ('13,53', '13.53'),
            ('3.2.5.3.2.1', '3.2.5.3.2.1.'),
            ('53,76', '53.76'),
            ('6.9.5', '6.9.5.'),
            ('7.7.6.3', '7.7.6.3.'),
            ('76,20', '76.20'),
            ('79,85', '79.85'),
            ('93,65', '93.65'),
        ]
        self.assertEqual(mwa_ppdb.entries()[:10], expected)
|
||||
|
||||
|
||||
# unload corpora
|
||||
from nltk.corpus import teardown_module
|
||||
@@ -0,0 +1,49 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Corpus View Regression Tests
|
||||
"""
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
import unittest
|
||||
import nltk.data
|
||||
from nltk.corpus.reader.util import (
|
||||
StreamBackedCorpusView,
|
||||
read_whitespace_block,
|
||||
read_line_block,
|
||||
)
|
||||
|
||||
|
||||
class TestCorpusViews(unittest.TestCase):
    """Regression tests comparing StreamBackedCorpusView with plain
    in-memory tokenization of the same files."""

    linetok = nltk.LineTokenizer(blanklines='keep')
    names = [
        'corpora/inaugural/README',  # A very short file (160 chars)
        'corpora/inaugural/1793-Washington.txt',  # A relatively short file (791 chars)
        'corpora/inaugural/1909-Taft.txt',  # A longer file (32k chars)
    ]

    def data(self):
        """Yield ``(path, decoded_text)`` for every file listed in ``names``."""
        for resource in self.names:
            path = nltk.data.find(resource)
            with path.open() as stream:
                text = stream.read().decode('utf8')
            yield path, text

    def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.
        for path, text in self.data():
            by_whitespace = StreamBackedCorpusView(path, read_whitespace_block)
            self.assertEqual(list(by_whitespace), text.split())

            by_line = StreamBackedCorpusView(path, read_line_block)
            self.assertEqual(list(by_line), self.linetok.tokenize(text))

    def test_correct_length(self):
        # Check that the corpus views report the correct lengths:
        for path, text in self.data():
            by_whitespace = StreamBackedCorpusView(path, read_whitespace_block)
            self.assertEqual(len(by_whitespace), len(text.split()))

            by_line = StreamBackedCorpusView(path, read_line_block)
            self.assertEqual(len(by_line), len(self.linetok.tokenize(text)))
|
||||
22
venv/lib/python3.7/site-packages/nltk/test/unit/test_data.py
Normal file
22
venv/lib/python3.7/site-packages/nltk/test/unit/test_data.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import unittest
|
||||
import nltk.data
|
||||
from nose.tools import assert_raises
|
||||
|
||||
|
||||
class TestData(unittest.TestCase):
    """Tests for the error raised by ``nltk.data.find`` on a missing resource."""

    def test_find_raises_exception(self):
        """A missing resource raises exactly LookupError (not a subclass)."""
        with self.assertRaises(LookupError) as context:
            nltk.data.find('no_such_resource/foo')

        # assertIs on the type keeps the original exact-type check while
        # using a TestCase assertion, which is not stripped under -O.
        self.assertIs(
            type(context.exception), LookupError, 'Unexpected exception raised'
        )

    def test_find_raises_exception_with_full_resource_name(self):
        """The error message contains the full name that was requested."""
        no_such_thing = 'no_such_thing/bar'

        with self.assertRaises(LookupError) as context:
            nltk.data.find(no_such_thing)

        self.assertIn(
            no_such_thing,
            str(context.exception),
            'Exception message does not include full resource name',
        )
|
||||
@@ -0,0 +1,142 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
import unittest
|
||||
|
||||
from nltk.metrics.agreement import AnnotationTask
|
||||
|
||||
class TestDisagreement(unittest.TestCase):

    '''
    Class containing unit tests for nltk.metrics.agreement.Disagreement.
    '''

    def test_easy(self):
        '''
        Simple test, based on
        https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
        '''
        # Each record is (coder, item, label).
        data = [
            ('coder1', 'dress1', 'YES'), ('coder2', 'dress1', 'NO'), ('coder3', 'dress1', 'NO'),
            ('coder1', 'dress2', 'YES'), ('coder2', 'dress2', 'NO'),
            ('coder3', 'dress3', 'NO'),
        ]
        task = AnnotationTask(data)
        self.assertAlmostEqual(task.alpha(), -0.3333333)

    def test_easy2(self):
        '''
        Same simple test with 1 rating removed.
        Removal of that rating should not matter: K-Alpha ignores items with
        only 1 rating.
        '''
        data = [
            ('coder1', 'dress1', 'YES'), ('coder2', 'dress1', 'NO'), ('coder3', 'dress1', 'NO'),
            ('coder1', 'dress2', 'YES'), ('coder2', 'dress2', 'NO'),
        ]
        task = AnnotationTask(data)
        self.assertAlmostEqual(task.alpha(), -0.3333333)

    def test_advanced(self):
        '''
        More advanced test, based on
        http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
        '''
        # One row per item; each record is (coder, item, label).
        data = [
            ('A', '1', '1'), ('B', '1', '1'), ('D', '1', '1'),
            ('A', '2', '2'), ('B', '2', '2'), ('C', '2', '3'), ('D', '2', '2'),
            ('A', '3', '3'), ('B', '3', '3'), ('C', '3', '3'), ('D', '3', '3'),
            ('A', '4', '3'), ('B', '4', '3'), ('C', '4', '3'), ('D', '4', '3'),
            ('A', '5', '2'), ('B', '5', '2'), ('C', '5', '2'), ('D', '5', '2'),
            ('A', '6', '1'), ('B', '6', '2'), ('C', '6', '3'), ('D', '6', '4'),
            ('A', '7', '4'), ('B', '7', '4'), ('C', '7', '4'), ('D', '7', '4'),
            ('A', '8', '1'), ('B', '8', '1'), ('C', '8', '2'), ('D', '8', '1'),
            ('A', '9', '2'), ('B', '9', '2'), ('C', '9', '2'), ('D', '9', '2'),
            ('B', '10', '5'), ('C', '10', '5'), ('D', '10', '5'),
            ('C', '11', '1'), ('D', '11', '1'),
            ('C', '12', '3'),
        ]
        task = AnnotationTask(data)
        self.assertAlmostEqual(task.alpha(), 0.743421052632)

    def test_advanced2(self):
        '''
        Same more advanced example, but with 1 rating removed.
        Again, removal of that 1 rating should not matter.

        NOTE(review): the data below is identical to the data in
        test_advanced (the single-rating item '12' is still present) --
        confirm which rating was meant to be removed.
        '''
        data = [
            ('A', '1', '1'), ('B', '1', '1'), ('D', '1', '1'),
            ('A', '2', '2'), ('B', '2', '2'), ('C', '2', '3'), ('D', '2', '2'),
            ('A', '3', '3'), ('B', '3', '3'), ('C', '3', '3'), ('D', '3', '3'),
            ('A', '4', '3'), ('B', '4', '3'), ('C', '4', '3'), ('D', '4', '3'),
            ('A', '5', '2'), ('B', '5', '2'), ('C', '5', '2'), ('D', '5', '2'),
            ('A', '6', '1'), ('B', '6', '2'), ('C', '6', '3'), ('D', '6', '4'),
            ('A', '7', '4'), ('B', '7', '4'), ('C', '7', '4'), ('D', '7', '4'),
            ('A', '8', '1'), ('B', '8', '1'), ('C', '8', '2'), ('D', '8', '1'),
            ('A', '9', '2'), ('B', '9', '2'), ('C', '9', '2'), ('D', '9', '2'),
            ('B', '10', '5'), ('C', '10', '5'), ('D', '10', '5'),
            ('C', '11', '1'), ('D', '11', '1'),
            ('C', '12', '3'),
        ]
        task = AnnotationTask(data)
        self.assertAlmostEqual(task.alpha(), 0.743421052632)
|
||||
|
||||
87
venv/lib/python3.7/site-packages/nltk/test/unit/test_hmm.py
Normal file
87
venv/lib/python3.7/site-packages/nltk/test/unit/test_hmm.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
from nltk.tag import hmm
|
||||
|
||||
|
||||
def _wikipedia_example_hmm():
    """Build the rain/umbrella HMM and its observation sequence.

    Returns ``(model, states, symbols, seq)`` where ``seq`` is a list of
    ``(symbol, None)`` pairs.
    """
    # Example from wikipedia
    # (http://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm)

    states = ['rain', 'no rain']
    symbols = ['umbrella', 'no umbrella']

    transitions = [[0.7, 0.3], [0.3, 0.7]]  # transition probabilities
    emissions = [[0.9, 0.1], [0.2, 0.8]]  # emission probabilities
    priors = [0.5, 0.5]  # initial probabilities

    observations = ['umbrella', 'umbrella', 'no umbrella', 'umbrella', 'umbrella']
    # Pair every observation with a None tag.
    seq = list(zip(observations, [None] * len(observations)))

    model = hmm._create_hmm_tagger(states, symbols, transitions, emissions, priors)
    return model, states, symbols, seq
|
||||
|
||||
|
||||
def test_forward_probability():
    """Forward probabilities of the market HMM example for two 'up' symbols."""
    from numpy.testing import assert_array_almost_equal

    # example from p. 385, Huang et al
    model, states, symbols = hmm._market_hmm_example()
    observations = [('up', None), ('up', None)]
    expected = [[0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357]]

    # The model's result is used as an exponent of 2 before comparing.
    forward = 2 ** model._forward_probability(observations)

    assert_array_almost_equal(forward, expected)
|
||||
|
||||
|
||||
def test_forward_probability2():
    """Row-normalized forward probabilities of the wikipedia example."""
    from numpy.testing import assert_array_almost_equal

    wikipedia_results = [
        [0.8182, 0.1818],
        [0.8834, 0.1166],
        [0.1907, 0.8093],
        [0.7308, 0.2692],
        [0.8673, 0.1327],
    ]

    model, states, symbols, seq = _wikipedia_example_hmm()
    forward = 2 ** model._forward_probability(seq)

    # examples in wikipedia are normalized
    row_sums = forward.sum(axis=1)
    forward = (forward.T / row_sums).T

    assert_array_almost_equal(wikipedia_results, forward, 4)
|
||||
|
||||
|
||||
def test_backward_probability():
    """Row-normalized backward probabilities of the wikipedia example."""
    from numpy.testing import assert_array_almost_equal

    wikipedia_results = [
        # Forward-backward algorithm doesn't need b0_5,
        # so .backward_probability doesn't compute it.
        # [0.6469, 0.3531],
        [0.5923, 0.4077],
        [0.3763, 0.6237],
        [0.6533, 0.3467],
        [0.6273, 0.3727],
        [0.5, 0.5],
    ]

    model, states, symbols, seq = _wikipedia_example_hmm()
    backward = 2 ** model._backward_probability(seq)

    # examples in wikipedia are normalized
    row_sums = backward.sum(axis=1)
    backward = (backward.T / row_sums).T

    assert_array_almost_equal(wikipedia_results, backward, 4)
|
||||
|
||||
|
||||
def setup_module(module):
    """Skip this test module when numpy cannot be imported."""
    from nose import SkipTest

    try:
        import numpy  # noqa: F401 -- imported only to check availability
    except ImportError:
        raise SkipTest("numpy is required for nltk.test.test_hmm")
|
||||
@@ -0,0 +1,237 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Twitter client
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Lorenzo Rubio <lrnzcig@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Regression tests for `json2csv()` and `json2csv_entities()` in Twitter
|
||||
package.
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from six.moves import zip
|
||||
|
||||
from nltk.compat import TemporaryDirectory
|
||||
from nltk.corpus import twitter_samples
|
||||
from nltk.twitter.common import json2csv, json2csv_entities
|
||||
|
||||
|
||||
def are_files_identical(filename1, filename2, debug=False):
    """
    Compare two files, ignoring carriage returns.

    Both files are read in binary mode and their lines are sorted, so the
    comparison is independent of line order; each pair of lines is compared
    after stripping leading and trailing whitespace.  Files that contain a
    different number of lines are never considered identical.

    :param filename1: path of the generated (output) file.
    :param filename2: path of the reference file.
    :param debug: if True, print the first difference that was found.
    :return: True if the files match, False otherwise.
    """
    with open(filename1, "rb") as fileA:
        linesA = sorted(fileA.readlines())
    with open(filename2, "rb") as fileB:
        linesB = sorted(fileB.readlines())

    for lineA, lineB in zip(linesA, linesB):
        if lineA.strip() != lineB.strip():
            if debug:
                print(
                    "Error while comparing files. "
                    + "First difference at line below."
                )
                print("=> Output file line: {0}".format(lineA))
                print("=> Refer. file line: {0}".format(lineB))
            return False

    # zip() stops at the shorter file, so surplus lines in either file would
    # otherwise go unnoticed and produce a false positive.
    if len(linesA) != len(linesB):
        if debug:
            print(
                "Error while comparing files. "
                + "Line counts differ: {0} vs {1}.".format(len(linesA), len(linesB))
            )
        return False
    return True
|
||||
|
||||
|
||||
class TestJSON2CSV(unittest.TestCase):
    """Regression tests comparing json2csv()/json2csv_entities() output with
    the reference CSV files stored in the ``files`` directory next to this
    module."""

    def setUp(self):
        # Only the first 100 lines of the sample corpus are converted.  The
        # ``with`` statement closes the file, so no explicit close() is needed.
        with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile:
            self.infile = [next(infile) for _ in range(100)]
        self.msg = "Test and reference files are not the same"
        self.subdir = os.path.join(os.path.dirname(__file__), 'files')

    def _reference(self, label):
        """Return the path of the reference CSV file for *label*."""
        return os.path.join(
            self.subdir, 'tweets.20150430-223406.{0}.csv.ref'.format(label)
        )

    def _assert_matches_reference(self, label, convert):
        """Call ``convert(outfn)`` on a temporary output path for *label* and
        assert that the written file matches the reference file for *label*."""
        ref_fn = self._reference(label)
        with TemporaryDirectory() as tempdir:
            outfn = os.path.join(
                tempdir, 'tweets.20150430-223406.{0}.csv'.format(label)
            )
            convert(outfn)
            self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg)

    def _check_json2csv(self, label, fields):
        """Run json2csv() with *fields* and compare against the reference."""
        self._assert_matches_reference(
            label,
            lambda outfn: json2csv(self.infile, outfn, fields, gzip_compress=False),
        )

    def _check_entities(self, label, main_fields, entity_type, entity_fields):
        """Run json2csv_entities() and compare against the reference."""
        self._assert_matches_reference(
            label,
            lambda outfn: json2csv_entities(
                self.infile,
                outfn,
                main_fields,
                entity_type,
                entity_fields,
                gzip_compress=False,
            ),
        )

    def test_textoutput(self):
        self._check_json2csv('text', ['text'])

    def test_tweet_metadata(self):
        fields = [
            'created_at',
            'favorite_count',
            'id',
            'in_reply_to_status_id',
            'in_reply_to_user_id',
            'retweet_count',
            'retweeted',
            'text',
            'truncated',
            'user.id',
        ]
        self._check_json2csv('tweet', fields)

    def test_user_metadata(self):
        fields = ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
        self._check_json2csv('user', fields)

    def test_tweet_hashtag(self):
        self._check_entities('hashtag', ['id', 'text'], 'hashtags', ['text'])

    def test_tweet_usermention(self):
        self._check_entities(
            'usermention', ['id', 'text'], 'user_mentions', ['id', 'screen_name']
        )

    def test_tweet_media(self):
        self._check_entities('media', ['id'], 'media', ['media_url', 'url'])

    def test_tweet_url(self):
        self._check_entities('url', ['id'], 'urls', ['url', 'expanded_url'])

    def test_userurl(self):
        self._check_entities(
            'userurl', ['id', 'screen_name'], 'user.urls', ['url', 'expanded_url']
        )

    def test_tweet_place(self):
        self._check_entities('place', ['id', 'text'], 'place', ['name', 'country'])

    def test_tweet_place_boundingbox(self):
        self._check_entities(
            'placeboundingbox', ['id', 'name'], 'place.bounding_box', ['coordinates']
        )

    def test_retweet_original_tweet(self):
        self._check_entities(
            'retweet',
            ['id'],
            'retweeted_status',
            [
                'created_at',
                'favorite_count',
                'id',
                'in_reply_to_status_id',
                'in_reply_to_user_id',
                'retweet_count',
                'text',
                'truncated',
                'user.id',
            ],
        )

    def test_file_is_wrong(self):
        """
        Sanity check that file comparison is not giving false positives.
        """
        # Deliberately compare the 'text' output with the 'retweet' reference.
        ref_fn = self._reference('retweet')
        with TemporaryDirectory() as tempdir:
            outfn = os.path.join(tempdir, 'tweets.20150430-223406.text.csv')
            json2csv(self.infile, outfn, ['text'], gzip_compress=False)
            self.assertFalse(are_files_identical(outfn, ref_fn), msg=self.msg)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -0,0 +1,24 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
|
||||
import unittest
|
||||
from nltk.classify.naivebayes import NaiveBayesClassifier
|
||||
|
||||
|
||||
class NaiveBayesClassifierTest(unittest.TestCase):
    """Smoke test for NaiveBayesClassifier trained on two labelled examples."""

    def test_simple(self):
        positive_example = ({'nice': True, 'good': True}, 'positive')
        negative_example = ({'bad': True, 'mean': True}, 'negative')
        classifier = NaiveBayesClassifier.train([positive_example, negative_example])

        # A feature seen only with the positive label must favour 'positive'.
        dist = classifier.prob_classify({'nice': True})
        self.assertTrue(dist.prob('positive') > dist.prob('negative'))
        self.assertEqual(dist.max(), 'positive')

        # ...and a feature seen only with the negative label, 'negative'.
        dist = classifier.prob_classify({'bad': True})
        self.assertTrue(dist.prob('positive') < dist.prob('negative'))
        self.assertEqual(dist.max(), 'negative')
|
||||
@@ -0,0 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unit tests for nltk.corpus.nombank
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
import unittest
|
||||
|
||||
from nltk.corpus import nombank
|
||||
# Load the nombank once.
|
||||
nombank.nouns()
|
||||
|
||||
class NombankDemo(unittest.TestCase):
    """Checks on the size and contents of the nombank corpus reader."""

    def test_numbers(self):
        instance_count = len(nombank.instances())
        self.assertEqual(instance_count, 114574)  # No. of instances.

        roleset_count = len(nombank.rolesets())
        self.assertEqual(roleset_count, 5577)  # No. of rolesets

        noun_count = len(nombank.nouns())
        self.assertEqual(noun_count, 4704)  # No. of nouns.

    def test_instance(self):
        first_instance = nombank.instances()[0]
        self.assertEqual(first_instance.roleset, 'perc-sign.01')

    def test_framefiles_fileids(self):
        self.assertEqual(len(nombank.fileids()), 4705)
        # Every file id is expected to name an XML file.
        non_xml = [fileid for fileid in nombank.fileids() if not fileid.endswith('.xml')]
        self.assertTrue(not non_xml)
|
||||
@@ -0,0 +1,85 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for nltk.pos_tag
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import unittest
|
||||
|
||||
from nltk import word_tokenize, pos_tag
|
||||
|
||||
|
||||
class TestPosTag(unittest.TestCase):
    """Tests for nltk.pos_tag on English and Russian input and on languages
    for which no tagger is available.

    All checks use the unittest assertion methods, so they are not stripped
    when Python runs with ``-O`` and a failure reports both values.
    """

    def test_pos_tag_eng(self):
        text = "John's big idea isn't all that bad."
        expected_tagged = [
            ('John', 'NNP'),
            ("'s", 'POS'),
            ('big', 'JJ'),
            ('idea', 'NN'),
            ('is', 'VBZ'),
            ("n't", 'RB'),
            ('all', 'PDT'),
            ('that', 'DT'),
            ('bad', 'JJ'),
            ('.', '.'),
        ]
        self.assertEqual(pos_tag(word_tokenize(text)), expected_tagged)

    def test_pos_tag_eng_universal(self):
        text = "John's big idea isn't all that bad."
        expected_tagged = [
            ('John', 'NOUN'),
            ("'s", 'PRT'),
            ('big', 'ADJ'),
            ('idea', 'NOUN'),
            ('is', 'VERB'),
            ("n't", 'ADV'),
            ('all', 'DET'),
            ('that', 'DET'),
            ('bad', 'ADJ'),
            ('.', '.'),
        ]
        self.assertEqual(
            pos_tag(word_tokenize(text), tagset='universal'), expected_tagged
        )

    def test_pos_tag_rus(self):
        text = u"Илья оторопел и дважды перечитал бумажку."
        expected_tagged = [
            ('Илья', 'S'),
            ('оторопел', 'V'),
            ('и', 'CONJ'),
            ('дважды', 'ADV'),
            ('перечитал', 'V'),
            ('бумажку', 'S'),
            ('.', 'NONLEX'),
        ]
        self.assertEqual(pos_tag(word_tokenize(text), lang='rus'), expected_tagged)

    def test_pos_tag_rus_universal(self):
        text = u"Илья оторопел и дважды перечитал бумажку."
        expected_tagged = [
            ('Илья', 'NOUN'),
            ('оторопел', 'VERB'),
            ('и', 'CONJ'),
            ('дважды', 'ADV'),
            ('перечитал', 'VERB'),
            ('бумажку', 'NOUN'),
            ('.', '.'),
        ]
        self.assertEqual(
            pos_tag(word_tokenize(text), tagset='universal', lang='rus'),
            expected_tagged,
        )

    def test_pos_tag_unknown_lang(self):
        text = u"모르겠 습니 다"
        self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang='kor')
        # Test for default kwarg, `lang=None`
        self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None)

    def test_unspecified_lang(self):
        # Tries to force the lang='eng' option.
        text = u"모르겠 습니 다"
        expected_but_wrong = [('모르겠', 'JJ'), ('습니', 'NNP'), ('다', 'NN')]
        self.assertEqual(pos_tag(word_tokenize(text)), expected_but_wrong)
|
||||
@@ -0,0 +1,92 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import unittest
|
||||
|
||||
from nltk.corpus import rte as rte_corpus
|
||||
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_features, rte_classifier
|
||||
|
||||
expected_from_rte_feature_extration = """
|
||||
alwayson => True
|
||||
ne_hyp_extra => 0
|
||||
ne_overlap => 1
|
||||
neg_hyp => 0
|
||||
neg_txt => 0
|
||||
word_hyp_extra => 3
|
||||
word_overlap => 3
|
||||
|
||||
alwayson => True
|
||||
ne_hyp_extra => 0
|
||||
ne_overlap => 1
|
||||
neg_hyp => 0
|
||||
neg_txt => 0
|
||||
word_hyp_extra => 2
|
||||
word_overlap => 1
|
||||
|
||||
alwayson => True
|
||||
ne_hyp_extra => 1
|
||||
ne_overlap => 1
|
||||
neg_hyp => 0
|
||||
neg_txt => 0
|
||||
word_hyp_extra => 1
|
||||
word_overlap => 2
|
||||
|
||||
alwayson => True
|
||||
ne_hyp_extra => 1
|
||||
ne_overlap => 0
|
||||
neg_hyp => 0
|
||||
neg_txt => 0
|
||||
word_hyp_extra => 6
|
||||
word_overlap => 2
|
||||
|
||||
alwayson => True
|
||||
ne_hyp_extra => 1
|
||||
ne_overlap => 0
|
||||
neg_hyp => 0
|
||||
neg_txt => 0
|
||||
word_hyp_extra => 4
|
||||
word_overlap => 0
|
||||
|
||||
alwayson => True
|
||||
ne_hyp_extra => 1
|
||||
ne_overlap => 0
|
||||
neg_hyp => 0
|
||||
neg_txt => 0
|
||||
word_hyp_extra => 3
|
||||
word_overlap => 1
|
||||
"""
|
||||
|
||||
|
||||
class RTEClassifierTest(unittest.TestCase):
    """Tests for RTE feature extraction and RTE classifier training."""

    # Test the feature extraction method.
    def test_rte_feature_extraction(self):
        pairs = rte_corpus.pairs(['rte1_dev.xml'])[:6]
        test_output = []
        for pair in pairs:
            # Extract the features once per pair rather than once per key.
            features = rte_features(pair)
            for key in sorted(features):
                test_output.append("%-15s => %s" % (key, features[key]))
        expected_output = expected_from_rte_feature_extration.strip().split('\n')
        # Remove null strings.
        expected_output = list(filter(None, expected_output))
        self.assertEqual(test_output, expected_output)

    # Test the RTEFeatureExtractor object.
    def test_feature_extractor_object(self):
        rtepair = rte_corpus.pairs(['rte3_dev.xml'])[33]
        extractor = RTEFeatureExtractor(rtepair)
        self.assertEqual(extractor.hyp_words, {'member', 'China', 'SCO.'})
        self.assertEqual(extractor.overlap('word'), set())
        self.assertEqual(extractor.overlap('ne'), {'China'})
        self.assertEqual(extractor.hyp_extra('word'), {'member'})

    # Test the RTE classifier training.
    def test_rte_classification_without_megam(self):
        # Only checks that training completes; the classifiers are not inspected.
        rte_classifier('IIS')
        rte_classifier('GIS')

    @unittest.skip("Skipping tests with dependencies on MEGAM")
    def test_rte_classification_with_megam(self):
        # The module imports only names *from* nltk sub-packages, so the
        # top-level package has to be imported here before it can be used.
        import nltk

        nltk.config_megam('/usr/local/bin/megam')
        rte_classifier('megam')
        rte_classifier('BFGS')
|
||||
@@ -0,0 +1,140 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
The following test performs a random series of reads, seeks, and
|
||||
tells, and checks that the results are consistent.
|
||||
"""
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
import random
|
||||
import functools
|
||||
from io import BytesIO
|
||||
from nltk.corpus.reader import SeekableUnicodeStreamReader
|
||||
|
||||
|
||||
def check_reader(unicode_string, encoding, n=1000):
    """Run ``n`` random tell/seek/read operations on a
    SeekableUnicodeStreamReader and check that the results are consistent.

    ``unicode_string`` is encoded with ``encoding`` and wrapped in a BytesIO
    stream that the reader decodes again.  A first pass reads the stream one
    character at a time to record every position reported by ``tell()``; the
    random operations are then checked against that record.

    :param unicode_string: text to encode and read back.
    :param encoding: name of the encoding used for both encoding and reading.
    :param n: number of random operations to perform.  NOTE(review): the
        counter is decremented and then compared with ``== 0``, so a value
        of 0 or less never terminates.
    :return: the string ``'passed'`` once ``n`` operations have completed.
    :raises AssertionError: if text that was read is inconsistent with the
        position recorded just before the read.
    """
    bytestr = unicode_string.encode(encoding)
    strlen = len(unicode_string)
    stream = BytesIO(bytestr)
    reader = SeekableUnicodeStreamReader(stream, encoding)
    # Find all character positions
    # (each entry pairs the tell() value with the character read from it;
    # the final entry holds the empty string returned at end of stream)
    chars = []
    while True:
        pos = reader.tell()
        chars.append((pos, reader.read(1)))
        if chars[-1][1] == '':
            break
    # Find all strings
    # (strings[pos] accumulates every character recorded at or after pos,
    # i.e. the text from pos to the end of the stream)
    strings = dict((pos, '') for (pos, c) in chars)
    for pos1, char in chars:
        for pos2, _ in chars:
            if pos2 <= pos1:
                strings[pos2] += char
    while True:
        # 'r' appears twice, so a read is chosen twice as often as the others.
        op = random.choice('tsrr')
        # Check our position?
        if op == 't':  # tell
            reader.tell()
        # Perform a seek?
        if op == 's':  # seek
            # Only seek to positions that tell() reported during the scan.
            new_pos = random.choice([p for (p, c) in chars])
            reader.seek(new_pos)
        # Perform a read?
        if op == 'r':  # read
            # Remember the position before reading only some of the time, so
            # reads also happen without a preceding tell().
            if random.random() < 0.3:
                pos = reader.tell()
            else:
                pos = None
            # Pick a size: None, a small size, or one that may exceed the
            # length of the string.
            if random.random() < 0.2:
                size = None
            elif random.random() < 0.8:
                size = random.randint(0, int(strlen / 6))
            else:
                size = random.randint(0, strlen + 20)
            if random.random() < 0.8:
                s = reader.read(size)
            else:
                s = reader.readline(size)
            # check that everything's consistent
            # (whatever was read must be a prefix of the text at that position)
            if pos is not None:
                assert pos in strings
                assert strings[pos].startswith(s)
        n -= 1
        if n == 0:
            return 'passed'
|
||||
|
||||
|
||||
# Call the randomized test function `check_reader` with a variety of
|
||||
# input strings and encodings.
|
||||
|
||||
ENCODINGS = ['ascii', 'latin1', 'greek', 'hebrew', 'utf-16', 'utf-8']
|
||||
|
||||
STRINGS = [
|
||||
"""
|
||||
This is a test file.
|
||||
It is fairly short.
|
||||
""",
|
||||
"This file can be encoded with latin1. \x83",
|
||||
"""\
|
||||
This is a test file.
|
||||
Here's a blank line:
|
||||
|
||||
And here's some unicode: \xee \u0123 \uffe3
|
||||
""",
|
||||
"""\
|
||||
This is a test file.
|
||||
Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
|
||||
""",
|
||||
]
|
||||
|
||||
|
||||
def test_reader():
    """Yield a ``check_reader`` test for every string/encoding pair in
    STRINGS x ENCODINGS that can actually be encoded."""
    for text in STRINGS:
        for codec in ENCODINGS:
            try:
                # skip strings that can't be encoded with the current encoding
                text.encode(codec)
                yield check_reader, text, codec
            except UnicodeEncodeError:
                continue
|
||||
|
||||
|
||||
# nose shows the whole string arguments in a verbose mode; this is annoying,
|
||||
# so large string test is separated.
|
||||
|
||||
LARGE_STRING = (
|
||||
"""\
|
||||
This is a larger file. It has some lines that are longer \
|
||||
than 72 characters. It's got lots of repetition. Here's \
|
||||
some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345
|
||||
|
||||
How fun! Let's repeat it twenty times.
|
||||
"""
|
||||
* 10
|
||||
)
|
||||
|
||||
|
||||
def test_reader_on_large_string():
    """Yield a ``check_reader`` test on LARGE_STRING for every encoding in
    ENCODINGS that can encode it."""

    # The wrapper keeps LARGE_STRING out of the yielded test arguments.
    def _check(encoding, n=1000):
        check_reader(LARGE_STRING, encoding, n)

    for codec in ENCODINGS:
        try:
            # skip strings that can't be encoded with the current encoding
            LARGE_STRING.encode(codec)
            yield _check, codec
        except UnicodeEncodeError:
            continue
|
||||
|
||||
|
||||
def test_reader_stream_is_closed():
    """Check that calling the reader's ``__del__`` closes its stream."""
    empty_stream = BytesIO(b'')
    reader = SeekableUnicodeStreamReader(empty_stream, 'ascii')

    # Open before the finalizer is invoked...
    assert reader.stream.closed is False
    reader.__del__()
    # ...and closed afterwards.
    assert reader.stream.closed is True
|
||||
|
||||
|
||||
def teardown_module(module=None):
    """Run a garbage-collection pass during module teardown.

    :param module: the module object passed by the test runner (unused).
    """
    from gc import collect

    collect()
|
||||
116
venv/lib/python3.7/site-packages/nltk/test/unit/test_senna.py
Normal file
116
venv/lib/python3.7/site-packages/nltk/test/unit/test_senna.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unit tests for Senna
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from os import environ, path, sep
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from nltk.classify import Senna
|
||||
from nltk.tag import SennaTagger, SennaChunkTagger, SennaNERTagger
|
||||
|
||||
# Take the Senna location from the SENNA environment variable when it is set
# (normalized, with a trailing separator); otherwise use a fixed default path.
SENNA_EXECUTABLE_PATH = (
    path.normpath(environ['SENNA']) + sep
    if 'SENNA' in environ
    else '/usr/share/senna-v3.0'
)

# The Senna test classes below are skipped unless this path exists.
senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
|
||||
|
||||
|
||||
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaPipeline(unittest.TestCase):
    """Unittest for nltk.classify.senna"""

    def test_senna_pipeline(self):
        """Senna pipeline interface"""

        # Each expected row is (word, chunk tag, NER tag, POS tag).
        expected = [
            ('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'),
            ('is', 'B-VP', 'O', 'VBZ'),
            ('an', 'B-NP', 'O', 'DT'),
            ('international', 'I-NP', 'O', 'JJ'),
            ('business', 'I-NP', 'O', 'NN'),
            ('center', 'I-NP', 'O', 'NN'),
        ]

        pipeline = Senna(SENNA_EXECUTABLE_PATH, ['pos', 'chk', 'ner'])
        words = 'Dusseldorf is an international business center'.split()
        result = []
        for token in pipeline.tag(words):
            result.append((token['word'], token['chk'], token['ner'], token['pos']))

        self.assertEqual(result, expected)
|
||||
|
||||
|
||||
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaTagger(unittest.TestCase):
    """Unittest for nltk.tag.senna"""

    def test_senna_tagger(self):
        words = 'What is the airspeed of an unladen swallow ?'.split()
        tags = ['WP', 'VBZ', 'DT', 'NN', 'IN', 'DT', 'NN', 'NN', '.']
        expected = list(zip(words, tags))

        tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
        self.assertEqual(tagger.tag(words), expected)

    def test_senna_chunk_tagger(self):
        words = 'What is the airspeed of an unladen swallow ?'.split()
        bio_tags = ['B-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'O']
        expected_1 = list(zip(words, bio_tags))
        # Each NP chunk is paired with the dash-joined indices of its tokens.
        expected_2 = [
            ('What', '0'),
            ('the airspeed', '2-3'),
            ('an unladen swallow', '5-6-7'),
        ]

        chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
        result_1 = chktagger.tag(words)
        result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type='NP'))

        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)

    def test_senna_ner_tagger(self):
        words_1 = 'Shakespeare theatre was in London .'.split()
        expected_1 = list(zip(words_1, ['B-PER', 'O', 'O', 'O', 'B-LOC', 'O']))
        words_2 = 'UN headquarters are in NY , USA .'.split()
        expected_2 = list(
            zip(words_2, ['B-ORG', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O'])
        )

        nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)
        result_1 = nertagger.tag(words_1)
        result_2 = nertagger.tag(words_2)

        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)
|
||||
146
venv/lib/python3.7/site-packages/nltk/test/unit/test_stem.py
Normal file
146
venv/lib/python3.7/site-packages/nltk/test/unit/test_stem.py
Normal file
@@ -0,0 +1,146 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from contextlib import closing
|
||||
|
||||
from nltk import data
|
||||
from nltk.stem.snowball import SnowballStemmer
|
||||
from nltk.stem.porter import PorterStemmer
|
||||
|
||||
|
||||
class SnowballTest(unittest.TestCase):
    """Regression tests for SnowballStemmer in several languages.

    Expected stems are checked with ``assertEqual`` so that the checks are
    not stripped under ``python -O`` and a failure shows both values.
    """

    def _check_stems(self, stemmer, cases):
        """Assert that *stemmer* maps each word in *cases* to its paired stem."""
        for word, expected in cases:
            self.assertEqual(stemmer.stem(word), expected)

    def test_arabic(self):
        """
        Unit test for the Snowball Arabic light stemmer.
        This stemmer deals with prefixes and suffixes.
        """
        # Test where the ignore_stopwords=True.
        self._check_stems(
            SnowballStemmer("arabic", True),
            [
                ('الْعَرَبِــــــيَّة', "عرب"),
                ("العربية", "عرب"),
                ("فقالوا", "قال"),
                ("الطالبات", "طالب"),
                ("فالطالبات", "طالب"),
                ("والطالبات", "طالب"),
                ("الطالبون", "طالب"),
                ("اللذان", "اللذان"),
                ("من", "من"),
            ],
        )
        # Test where the ignore_stopwords=False.
        self._check_stems(
            SnowballStemmer("arabic", False),
            [
                ("اللذان", "اللذ"),  # this is a stop word
                ("الطالبات", "طالب"),
                ("الكلمات", "كلم"),
            ],
        )
        # Test where the Arabic stemmer is created without passing a value
        # for ignore_stopwords.
        self._check_stems(
            SnowballStemmer("arabic"),
            [
                ('الْعَرَبِــــــيَّة', "عرب"),
                ("العربية", "عرب"),
                ("فقالوا", "قال"),
                ("الطالبات", "طالب"),
                ("الكلمات", "كلم"),
            ],
        )

    def test_russian(self):
        stemmer_russian = SnowballStemmer("russian")
        self.assertEqual(stemmer_russian.stem("авантненькая"), "авантненьк")

    def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        self.assertEqual(stemmer_german.stem("Schr\xe4nke"), 'schrank')
        self.assertEqual(stemmer_german2.stem("Schr\xe4nke"), 'schrank')

        # With ignore_stopwords=True the expected stem of "keinen" is the
        # word itself.
        self.assertEqual(stemmer_german.stem("keinen"), 'kein')
        self.assertEqual(stemmer_german2.stem("keinen"), 'keinen')

    def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        self.assertEqual(stemmer.stem("Visionado"), 'vision')

        # The word 'algue' was raising an IndexError
        self.assertEqual(stemmer.stem("algue"), 'algu')

    def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        self.assertEqual(stemmer.stem("y's"), 'y')
|
||||
|
||||
|
||||
class PorterTest(unittest.TestCase):
    """Tests for PorterStemmer against the stored vocabulary and the
    expected-output files for each stemmer mode."""

    def _read_lines(self, resource):
        """Return the lines of the NLTK data file *resource*, closing the
        file once it has been read."""
        with closing(data.find(resource).open(encoding='utf-8')) as fp:
            return fp.read().splitlines()

    def _vocabulary(self):
        """Return the words of the Porter test vocabulary."""
        return self._read_lines('stemmers/porter_test/porter_vocabulary.txt')

    def _test_against_expected_output(self, stemmer_mode, expected_stems):
        """Stem every vocabulary word in *stemmer_mode* and compare the
        result with the stem at the same position in *expected_stems*."""
        stemmer = PorterStemmer(mode=stemmer_mode)
        for word, true_stem in zip(self._vocabulary(), expected_stems):
            our_stem = stemmer.stem(word)
            self.assertEqual(
                our_stem,
                true_stem,
                "%s should stem to %s in %s mode but got %s"
                % (word, true_stem, stemmer_mode, our_stem),
            )

    def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter

        The sample vocabulary and output were sourced from:
            http://tartarus.org/martin/PorterStemmer/voc.txt
            http://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at
            http://tartarus.org/martin/PorterStemmer/
        """
        self._test_against_expected_output(
            PorterStemmer.MARTIN_EXTENSIONS,
            self._read_lines('stemmers/porter_test/porter_martin_output.txt'),
        )

    def test_vocabulary_nltk_mode(self):
        self._test_against_expected_output(
            PorterStemmer.NLTK_EXTENSIONS,
            self._read_lines('stemmers/porter_test/porter_nltk_output.txt'),
        )

    def test_vocabulary_original_mode(self):
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # http://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.
        #
        # The comparison runs once; an earlier second, identical run read
        # the same file through a handle that was never closed.
        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            self._read_lines('stemmers/porter_test/porter_original_output.txt'),
        )

    def test_oed_bug(self):
        """Test for bug https://github.com/nltk/nltk/issues/1581

        Ensures that 'oed' can be stemmed without throwing an error.
        """
        self.assertEqual(PorterStemmer().stem('oed'), 'o')
|
||||
30
venv/lib/python3.7/site-packages/nltk/test/unit/test_tag.py
Normal file
30
venv/lib/python3.7/site-packages/nltk/test/unit/test_tag.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
|
||||
def test_basic():
    """Tag one English sentence with pos_tag and check every (token, tag) pair."""
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    expected = [
        ('John', 'NNP'),
        ("'s", 'POS'),
        ('big', 'JJ'),
        ('idea', 'NN'),
        ('is', 'VBZ'),
        ("n't", 'RB'),
        ('all', 'PDT'),
        ('that', 'DT'),
        ('bad', 'JJ'),
        ('.', '.'),
    ]

    tokens = word_tokenize("John's big idea isn't all that bad.")
    result = pos_tag(tokens)
    assert result == expected
|
||||
|
||||
|
||||
def setup_module(module):
    """Skip every test in this module when numpy is not installed.

    :param module: the module object passed in by the test runner (unused).
    :raises unittest.SkipTest: if numpy cannot be imported.
    """
    # Use the standard-library SkipTest rather than importing it from nose:
    # the availability check must not itself fail with ImportError when nose
    # is absent.  nose's skip plugin handles unittest.SkipTest as well.
    from unittest import SkipTest

    try:
        import numpy  # imported only to probe availability
    except ImportError:
        raise SkipTest("numpy is required for nltk.test.test_tag")
|
||||
790
venv/lib/python3.7/site-packages/nltk/test/unit/test_tgrep.py
Normal file
790
venv/lib/python3.7/site-packages/nltk/test/unit/test_tgrep.py
Normal file
@@ -0,0 +1,790 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Natural Language Toolkit: TGrep search
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Will Roberts <wildwilhelm@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
'''
|
||||
Unit tests for nltk.tgrep.
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import unittest
|
||||
|
||||
from six import b
|
||||
|
||||
from nltk.tree import ParentedTree
|
||||
from nltk import tgrep
|
||||
|
||||
|
||||
class TestSequenceFunctions(unittest.TestCase):
|
||||
|
||||
'''
|
||||
Class containing unit tests for nltk.tgrep.
|
||||
'''
|
||||
|
||||
def test_tokenize_simple(self):
|
||||
'''
|
||||
Simple test of tokenization.
|
||||
'''
|
||||
tokens = tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]')
|
||||
self.assertEqual(
|
||||
tokens,
|
||||
[
|
||||
'A',
|
||||
'..',
|
||||
'(',
|
||||
'B',
|
||||
'!',
|
||||
'<',
|
||||
'C',
|
||||
'.',
|
||||
'D',
|
||||
')',
|
||||
'|',
|
||||
'!',
|
||||
'[',
|
||||
'<<',
|
||||
'(',
|
||||
'E',
|
||||
',',
|
||||
'F',
|
||||
')',
|
||||
'$',
|
||||
'G',
|
||||
']',
|
||||
],
|
||||
)
|
||||
|
||||
def test_tokenize_encoding(self):
|
||||
'''
|
||||
Test that tokenization handles bytes and strs the same way.
|
||||
'''
|
||||
self.assertEqual(
|
||||
tgrep.tgrep_tokenize(b('A .. (B !< C . D) | ![<< (E , F) $ G]')),
|
||||
tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'),
|
||||
)
|
||||
|
||||
def test_tokenize_link_types(self):
    '''
    Every basic link operator, written without surrounding whitespace
    between two node names, must come out as a single token -- both in
    its plain form and when preceded by the negation mark.
    '''
    operators = [
        '<', '>',
        '<3', '>3',
        '<,', '>,',
        '<-3', '>-3',
        '<-', '>-',
        "<'", ">'",
        '<:', '>:',
        '<<', '>>',
        '<<,', '>>,',
        "<<'", ">>'",
        '<<:', '>>:',
        '.', ',',
        '..', ',,',
        '$',
        '$.', '$,',
        '$..', '$,,',
    ]
    # Plain form: A<op>B
    for op in operators:
        self.assertEqual(tgrep.tgrep_tokenize('A' + op + 'B'), ['A', op, 'B'])
    # Negated form: A!<op>B -- the '!' is a token of its own.
    for op in operators:
        self.assertEqual(
            tgrep.tgrep_tokenize('A!' + op + 'B'), ['A', '!', op, 'B']
        )
|
||||
|
||||
def test_tokenize_examples(self):
    '''
    Tokenize the example patterns taken from the TGrep2 manual.
    '''
    examples = [
        ('NP < PP', ['NP', '<', 'PP']),
        ('/^NP/', ['/^NP/']),
        ('NP << PP . VP', ['NP', '<<', 'PP', '.', 'VP']),
        ('NP << PP | . VP', ['NP', '<<', 'PP', '|', '.', 'VP']),
        (
            'NP !<< PP [> NP | >> VP]',
            ['NP', '!', '<<', 'PP', '[', '>', 'NP', '|', '>>', 'VP', ']'],
        ),
        ('NP << (PP . VP)', ['NP', '<<', '(', 'PP', '.', 'VP', ')']),
        (
            'NP <\' (PP <, (IN < on))',
            ['NP', '<\'', '(', 'PP', '<,', '(', 'IN', '<', 'on', ')', ')'],
        ),
        ('S < (A < B) < C', ['S', '<', '(', 'A', '<', 'B', ')', '<', 'C']),
        (
            'S < ((A < B) < C)',
            ['S', '<', '(', '(', 'A', '<', 'B', ')', '<', 'C', ')'],
        ),
        ('S < (A < B < C)', ['S', '<', '(', 'A', '<', 'B', '<', 'C', ')']),
        ('A<B&.C', ['A', '<', 'B', '&', '.', 'C']),
    ]
    for pattern, tokens in examples:
        self.assertEqual(tgrep.tgrep_tokenize(pattern), tokens)
|
||||
|
||||
def test_tokenize_quoting(self):
    '''
    Double-quoted node names must stay intact as single tokens even when
    they contain operator characters or spaces.
    '''
    pattern = '"A<<:B"<<:"A $.. B"<"A>3B"<C'
    expected = ['"A<<:B"', '<<:', '"A $.. B"', '<', '"A>3B"', '<', 'C']
    self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
|
||||
|
||||
def test_tokenize_nodenames(self):
    '''
    Tokenize the different ways of naming a node: literal names, regex
    names, wildcards and NLTK tree-position syntax.
    '''
    cases = [
        ('Robert', ['Robert']),
        ('/^[Bb]ob/', ['/^[Bb]ob/']),
        ('*', ['*']),
        ('__', ['__']),
        # NLTK tree position syntax
        ('N()', ['N(', ')']),
        ('N(0,)', ['N(', '0', ',', ')']),
        ('N(0,0)', ['N(', '0', ',', '0', ')']),
        ('N(0,0,)', ['N(', '0', ',', '0', ',', ')']),
    ]
    for pattern, tokens in cases:
        self.assertEqual(tgrep.tgrep_tokenize(pattern), tokens)
|
||||
|
||||
def test_tokenize_macros(self):
    '''
    Tokenize a search string that defines two macros and then uses them.
    '''
    pattern = '@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN'
    expected = [
        # first macro definition
        '@', 'NP', '/^NP/', ';',
        # second macro definition
        '@', 'NN', '/^NN/', ';',
        # the search pattern itself
        '@NP', '[', '!', '<', 'NP', '|', '<', '@NN', ']', '!', '$..', '@NN',
    ]
    self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
|
||||
|
||||
def test_node_simple(self):
    '''
    Look up nodes by plain label, both as tree positions and as node
    objects, and by an alternation of two labels.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
    )
    forest = [tree]

    nn_positions = list(tgrep.tgrep_positions('NN', forest))
    self.assertEqual(nn_positions, [[(0, 2), (2, 1)]])

    nn_nodes = list(tgrep.tgrep_nodes('NN', forest))
    self.assertEqual(nn_nodes, [[tree[0, 2], tree[2, 1]]])

    nn_or_jj = list(tgrep.tgrep_positions('NN|JJ', forest))
    self.assertEqual(nn_or_jj, [[(0, 1), (0, 2), (2, 1)]])
|
||||
|
||||
def test_node_printing(self):
    '''Check that prefixing a node with the print operator ' changes nothing.'''
    tree = ParentedTree.fromstring('(S (n x) (N x))')
    for pattern in ('N', '/[Nn]/'):
        plain = list(tgrep.tgrep_positions(pattern, [tree]))
        marked = list(tgrep.tgrep_positions('\'' + pattern, [tree]))
        self.assertEqual(plain, marked)
|
||||
|
||||
def test_node_encoding(self):
    '''
    A search pattern given as bytes must find exactly what the same
    pattern given as text finds, for positions as well as for nodes.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
    )
    comparisons = [
        (tgrep.tgrep_positions, 'NN'),
        (tgrep.tgrep_nodes, 'NN'),
        (tgrep.tgrep_positions, 'NN|JJ'),
    ]
    for search, pattern in comparisons:
        via_bytes = list(search(b(pattern), [tree]))
        via_text = list(search(pattern, [tree]))
        self.assertEqual(via_bytes, via_text)
|
||||
|
||||
def test_node_nocase(self):
    '''
    Compare a case-sensitive quoted node name with its case-insensitive
    (i@-prefixed) counterpart.
    '''
    tree = ParentedTree.fromstring('(S (n x) (N x))')
    exact = list(tgrep.tgrep_positions('"N"', [tree]))
    self.assertEqual(exact, [[(1,)]])
    ignoring_case = list(tgrep.tgrep_positions('i@"N"', [tree]))
    self.assertEqual(ignoring_case, [[(0,), (1,)]])
|
||||
|
||||
def test_node_quoted(self):
    '''
    Select nodes through quoted names, including names that themselves
    contain double quotes and backslashes.
    '''
    tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
    cases = [
        ('"N"', [[()]]),
        ('"\\"N\\""', [[(0,)]]),
        ('"N\\""', [[(1,)]]),
        ('"\\"\\\\\\""', [[(2,)]]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
|
||||
|
||||
def test_node_regex(self):
    '''
    Match node labels with a regular expression anchored at the start.
    '''
    tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
    # /^NP/ accepts every label that begins with NP, so NP-SBJ is matched
    # along with NP itself.
    found = list(tgrep.tgrep_positions('/^NP/', [tree]))
    self.assertEqual(found, [[(0,), (1,)]])
|
||||
|
||||
def test_node_regex_2(self):
    '''
    Contrast an anchored regex with an unanchored one on node labels.
    '''
    tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
    anchored = list(tgrep.tgrep_positions('/^SBJ/', [tree]))
    self.assertEqual(anchored, [[(0,), (1,)]])
    # Without the anchor the regex accepts any label containing SBJ,
    # which brings in NP-SBJ as well.
    unanchored = list(tgrep.tgrep_positions('/SBJ/', [tree]))
    self.assertEqual(unanchored, [[(0,), (1,), (2,)]])
|
||||
|
||||
def test_node_tree_position(self):
    '''
    Address every non-leaf node by its NLTK tree position and check that
    the search returns exactly that one position.
    '''
    tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
    leaves = {
        tree.leaf_treeposition(index) for index in range(len(tree.leaves()))
    }
    inner_positions = [pos for pos in tree.treepositions() if pos not in leaves]
    # only positions that are not leaves are exercised
    for expected in inner_positions:
        pattern = 'N{0}'.format(expected)
        found = list(tgrep.tgrep_positions(pattern, [tree]))
        first_tree_hits = found[0]
        self.assertEqual(len(first_tree_hits), 1)
        self.assertEqual(first_tree_hits[0], expected)
|
||||
|
||||
def test_node_noleaves(self):
    '''
    Search for a leaf label with the default arguments and again with
    the third positional argument (search_leaves) set to False.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
    with_leaves = list(tgrep.tgrep_positions('x', [tree]))
    self.assertEqual(with_leaves, [[(0, 0, 0), (1, 0, 0)]])
    without_leaves = list(tgrep.tgrep_positions('x', [tree], False))
    self.assertEqual(without_leaves, [[]])
|
||||
|
||||
def tests_rel_dominance(self):
    '''
    Exercise the dominance relations (parent/child, ancestor/descendant,
    their positional and only-child variants, and negations) on a series
    of small trees.
    '''
    # Each scenario is a bracketed tree plus the (pattern, positions)
    # pairs checked against it; positions are those expected for the
    # single tree searched.
    scenarios = [
        (
            '(S (A (T x)) (B (N x)))',
            [
                ('* < T', [(0,)]),
                ('* < T > S', [(0,)]),
                ('* !< T', [(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]),
                ('* !< T > S', [(1,)]),
                ('* > A', [(0, 0)]),
                ('* > B', [(1, 0)]),
                ('* !> B', [(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]),
                ('* !> B >> S', [(0,), (0, 0), (1,)]),
                ('* >> S', [(0,), (0, 0), (1,), (1, 0)]),
                ('* >>, S', [(0,), (0, 0)]),
                ('* >>\' S', [(1,), (1, 0)]),
                # Known issue: the check that '* !>> S' yields [[()]] is
                # left disabled.
                ('* << T', [(), (0,)]),
                ('* <<\' T', [(0,)]),
                ('* <<1 N', [(1,)]),
                ('* !<< T', [(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]),
            ],
        ),
        (
            '(S (A (T x)) (B (T x) (N x )))',
            [
                ('* <: T', [(0,)]),
                ('* < T', [(0,), (1,)]),
                (
                    '* !<: T',
                    [
                        (),
                        (0, 0),
                        (0, 0, 0),
                        (1,),
                        (1, 0),
                        (1, 0, 0),
                        (1, 1),
                        (1, 1, 0),
                    ],
                ),
                ('* !<: T > S', [(1,)]),
            ],
        ),
        (
            '(S (T (A x) (B x)) (T (C x)))',
            [
                ('* >: T', [(1, 0)]),
                (
                    '* !>: T',
                    [
                        (),
                        (0,),
                        (0, 0),
                        (0, 0, 0),
                        (0, 1),
                        (0, 1, 0),
                        (1,),
                        (1, 0, 0),
                    ],
                ),
            ],
        ),
        (
            '(S (A (B (C (D (E (T x))))))' ' (A (B (C (D (E (T x))) (N x)))))',
            [
                (
                    '* <<: T',
                    [
                        (0,),
                        (0, 0),
                        (0, 0, 0),
                        (0, 0, 0, 0),
                        (0, 0, 0, 0, 0),
                        (1, 0, 0, 0),
                        (1, 0, 0, 0, 0),
                    ],
                ),
                (
                    '* >>: A',
                    [
                        (0, 0),
                        (0, 0, 0),
                        (0, 0, 0, 0),
                        (0, 0, 0, 0, 0),
                        (0, 0, 0, 0, 0, 0),
                        (1, 0),
                        (1, 0, 0),
                    ],
                ),
            ],
        ),
    ]
    for bracketed, checks in scenarios:
        tree = ParentedTree.fromstring(bracketed)
        for pattern, hits in checks:
            self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), [hits])
|
||||
|
||||
def test_bad_operator(self):
    '''
    An undefined operator in the pattern must surface as TgrepException
    when the search results are consumed.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
    # The search object is built outside the checked region; only
    # consuming it with list() is expected to raise.
    results = tgrep.tgrep_positions('* >>> S', [tree])
    with self.assertRaises(tgrep.TgrepException):
        list(results)
|
||||
|
||||
def test_comments(self):
    '''
    A search string with '#' comment lines must give the same matches
    as the same search string without them.
    '''
    tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
    expected = [[(0,), (2,)]]

    without_comments = '''
    @ NP /^NP/;
    @ NN /^NN/;
    @NN
    '''
    self.assertEqual(
        list(tgrep.tgrep_positions(without_comments, [tree])), expected
    )

    with_comments = '''
    # macros
    @ NP /^NP/;
    @ NN /^NN/;

    # search string
    @NN
    '''
    self.assertEqual(list(tgrep.tgrep_positions(with_comments, [tree])), expected)
|
||||
|
||||
def test_rel_sister_nodes(self):
    '''
    Exercise the sister relations ($ and its ordered variants) on a
    tree with three sibling nodes.
    '''
    tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
    cases = [
        ('* $. B', [[(0,)]]),
        ('* $.. B', [[(0,)]]),
        ('* $, B', [[(2,)]]),
        ('* $,, B', [[(2,)]]),
        ('* $ B', [[(0,), (2,)]]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
|
||||
|
||||
def tests_rel_indexed_children(self):
    '''
    Exercise the relations that select a node by its index among its
    parent's children, counted from either end.
    '''
    # Every check below expects exactly one match; the tables map a
    # pattern to that single position.
    child_side = [
        ('* >, S', (0,)),
        ('* >1 S', (0,)),
        ('* >2 S', (1,)),
        ('* >3 S', (2,)),
        ('* >\' S', (2,)),
        ('* >-1 S', (2,)),
        ('* >-2 S', (1,)),
        ('* >-3 S', (0,)),
    ]
    tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
    for pattern, position in child_side:
        self.assertEqual(
            list(tgrep.tgrep_positions(pattern, [tree])), [[position]]
        )

    parent_side = [
        ('* <, A', (0,)),
        ('* <1 A', (0,)),
        ('* <2 A', (2,)),
        ('* <3 A', (1,)),
        ('* <\' A', (1,)),
        ('* <-1 A', (1,)),
        ('* <-2 A', (2,)),
        ('* <-3 A', (0,)),
    ]
    tree = ParentedTree.fromstring(
        '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) ' '(F (C x) (A x) (B x)))'
    )
    for pattern, position in parent_side:
        self.assertEqual(
            list(tgrep.tgrep_positions(pattern, [tree])), [[position]]
        )
|
||||
|
||||
def test_rel_precedence(self):
    '''
    Exercise the precedence relations (. .. , ,,) against the nodes
    X and Y of one tree.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (NP (PP x)) (NP (AP x)))'
        ' (VP (AP (X (PP x)) (Y (AP x))))'
        ' (NP (RC (NP (AP x)))))'
    )
    cases = [
        ('* . X', [(0,), (0, 1), (0, 1, 0)]),
        ('* . Y', [(1, 0, 0), (1, 0, 0, 0)]),
        ('* .. X', [(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]),
        (
            '* .. Y',
            [(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)],
        ),
        ('* , X', [(1, 0, 1), (1, 0, 1, 0)]),
        ('* , Y', [(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]),
        (
            '* ,, X',
            [(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)],
        ),
        ('* ,, Y', [(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]),
    ]
    for pattern, hits in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), [hits])
|
||||
|
||||
def test_examples(self):
    '''
    Run the patterns from the "Basic Examples" of the TGrep2 manual
    against trees built for each of them.
    '''

    def find(pattern, parsed):
        # Positions matched by `pattern` in the single tree `parsed`.
        return list(tgrep.tgrep_positions(pattern, [parsed]))

    # Any NP node that immediately dominates a PP.
    parsed = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)))')
    self.assertEqual(find('NP < PP', parsed), [[(1,)]])

    # An NP that dominates a PP and is immediately followed by a VP.
    parsed = ParentedTree.fromstring('(S (NP x) (VP x) (NP (PP x)) (VP x))')
    self.assertEqual(find('NP << PP . VP', parsed), [[(2,)]])

    # An NP that dominates a PP or is immediately followed by a VP.
    parsed = ParentedTree.fromstring(
        '(S (NP (AP x)) (NP (PP x)) ' '(NP (DET x) (NN x)) (VP x))'
    )
    self.assertEqual(find('NP << PP | . VP', parsed), [[(1,), (2,)]])

    # An NP that does not dominate a PP and that, in addition, either
    # has an NP parent or is dominated by a VP.
    parsed = ParentedTree.fromstring(
        '(S (NP (NP (PP x)) (NP (AP x)))'
        ' (VP (AP (NP (PP x)) (NP (AP x))))'
        ' (NP (RC (NP (AP x)))))'
    )
    self.assertEqual(
        find('NP !<< PP [> NP | >> VP]', parsed), [[(0, 1), (1, 0, 1)]]
    )

    # An NP that dominates a PP which itself is immediately followed by
    # a VP; the parentheses attach ". VP" to the PP, not to the NP.
    parsed = ParentedTree.fromstring(
        '(S (NP (AP (PP x) (VP x))) ' '(NP (AP (PP x) (NP x))) (NP x))'
    )
    self.assertEqual(find('NP << (PP . VP)', parsed), [[(0,)]])

    # An NP whose last child is a PP that begins with the preposition
    # "on".
    parsed = ParentedTree.fromstring(
        '(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))'
        ' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))'
        ' (NP x))'
    )
    self.assertEqual(find('NP <\' (PP <, (IN < on))', parsed), [[(0,)]])

    # An S with a child A and another child C, where the A has a
    # child B.
    parsed = ParentedTree.fromstring(
        '(S (S (C x) (A (B x))) (S (C x) (A x)) ' '(S (D x) (A (B x))))'
    )
    self.assertEqual(find('S < (A < B) < C', parsed), [[(0,)]])

    # Here, by contrast, S has a child A and that A has both B and C as
    # children.
    parsed = ParentedTree.fromstring(
        '(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))'
    )
    self.assertEqual(find('S < ((A < B) < C)', parsed), [[(0,)]])

    # The previous pattern is equivalent to this one (same tree).
    self.assertEqual(find('S < (A < B < C)', parsed), [[(0,)]])
|
||||
|
||||
def test_use_macros(self):
    '''
    Define two tgrep2 macros and use them in a search; then check that
    referring to a macro that was never defined raises TgrepException.
    '''
    tree = ParentedTree.fromstring(
        '(VP (VB sold) (NP (DET the) '
        '(NN heiress)) (NP (NN deed) (PREP to) '
        '(NP (DET the) (NN school) (NN house))))'
    )
    macro_defs = '@ NP /^NP/;\n@ NN /^NN/;\n'

    found = list(
        tgrep.tgrep_positions(macro_defs + '@NP !< @NP !$.. @NN', [tree])
    )
    self.assertEqual(found, [[(1,), (2, 2)]])

    # @CNP is not among the macros defined above.
    results = tgrep.tgrep_positions(macro_defs + '@CNP !< @NP !$.. @NN', [tree])
    with self.assertRaises(tgrep.TgrepException):
        list(results)
|
||||
|
||||
def test_tokenize_node_labels(self):
    '''Tokenize a pattern without node labels, then the same pattern with them.'''
    unlabeled = 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
    self.assertEqual(
        tgrep.tgrep_tokenize(unlabeled),
        [
            'S', '<', '@SBJ',
            '<', '(', '@VP',
            '<', '(', '@VB', '$..', '@OBJ', ')', ')',
        ],
    )

    # A label is tokenized as '=' followed by the label name.
    labeled = 'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))'
    self.assertEqual(
        tgrep.tgrep_tokenize(labeled),
        [
            'S', '<', '@SBJ', '=', 's',
            '<', '(', '@VP', '=', 'v',
            '<', '(', '@VB', '$..', '@OBJ', ')', ')',
        ],
    )
|
||||
|
||||
def test_tokenize_segmented_patterns(self):
    '''Tokenize a pattern whose second segment refers back to node labels.'''
    pattern = 'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'
    expected = [
        # first segment
        'S', '<', '@SBJ', '=', 's',
        '<', '(', '@VP', '=', 'v',
        '<', '(', '@VB', '$..', '@OBJ', ')', ')',
        # segment separator and second segment
        ':', '=s', '..', '=v',
    ]
    self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
|
||||
|
||||
def test_labeled_nodes(self):
    '''
    Check a labeled-node search against an unlabeled variant and a
    hand-written rewrite, on two sentences with different word order.

    Test case from Emily M. Bender.
    '''

    def find(pattern, sentence):
        # Positions matched by `pattern` in the single tree `sentence`.
        return list(tgrep.tgrep_positions(pattern, [sentence]))

    search = '''
    # macros
    @ SBJ /SBJ/;
    @ VP /VP/;
    @ VB /VB/;
    @ VPoB /V[PB]/;
    @ OBJ /OBJ/;

    # 1 svo
    S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
    # Everything before the first blank line (the macro definitions),
    # followed by the pattern without labels and without second segment.
    macros_only = search.split('\n\n')[0]
    search_firsthalf = macros_only + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
    search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

    sent1 = ParentedTree.fromstring(
        '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))'
    )
    sent2 = ParentedTree.fromstring(
        '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))'
    )

    # sent1: all three searches find something, and the labeled search
    # agrees with the rewrite.
    self.assertTrue(find(search_firsthalf, sent1)[0])
    self.assertTrue(find(search, sent1)[0])
    self.assertTrue(find(search_rewrite, sent1)[0])
    self.assertEqual(find(search, sent1), find(search_rewrite, sent1))

    # sent2: only the unlabeled first half finds something; the labeled
    # search and the rewrite both come back empty and agree.
    self.assertTrue(find(search_firsthalf, sent2)[0])
    self.assertFalse(find(search, sent2)[0])
    self.assertFalse(find(search_rewrite, sent2)[0])
    self.assertEqual(find(search, sent2), find(search_rewrite, sent2))
|
||||
|
||||
def test_multiple_conjs(self):
    '''
    A node pattern carrying three conjoined relations must be handled
    correctly, and so must the two-relation version of it.
    '''
    sent = ParentedTree.fromstring('((A (B b) (C c)) (A (B b) (C c) (D d)))')
    three_relations = '(A < B < C < D)'
    two_relations = '(A < B < C)'
    self.assertEqual(
        list(tgrep.tgrep_positions(three_relations, [sent])), [[(1,)]]
    )
    self.assertEqual(
        list(tgrep.tgrep_positions(two_relations, [sent])), [[(0,), (1,)]]
    )
|
||||
|
||||
def test_trailing_semicolon(self):
    '''
    One or more semicolons at the end of a tgrep2 search string must
    not break parsing nor change the matches.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
    )
    expected = [[(0, 2), (2, 1)]]
    for pattern in ('NN', 'NN;', 'NN;;'):
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
|
||||
|
||||
|
||||
# Run the test cases of this module when it is executed as a script.
if '__main__' == __name__:
    unittest.main()
|
||||
407
venv/lib/python3.7/site-packages/nltk/test/unit/test_tokenize.py
Normal file
407
venv/lib/python3.7/site-packages/nltk/test/unit/test_tokenize.py
Normal file
@@ -0,0 +1,407 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unit tests for nltk.tokenize.
|
||||
See also nltk/test/tokenize.doctest
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import unittest
|
||||
|
||||
from nose import SkipTest
|
||||
from nose.tools import assert_equal
|
||||
|
||||
from nltk.tokenize import (
|
||||
punkt,
|
||||
word_tokenize,
|
||||
TweetTokenizer,
|
||||
StanfordSegmenter,
|
||||
TreebankWordTokenizer,
|
||||
SyllableTokenizer,
|
||||
)
|
||||
|
||||
|
||||
class TestTokenize(unittest.TestCase):
|
||||
def test_tweet_tokenizer(self):
    """
    Tokenize a tweet that starts with a handle and contains words with
    accented characters, using handle stripping and length reduction.
    """
    tweet = "@myke: Let's test these words: resumé España München français"
    expected = [
        ':',
        "Let's",
        'test',
        'these',
        'words',
        ':',
        'resumé',
        'España',
        'München',
        'français',
    ]
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    self.assertEqual(tokenizer.tokenize(tweet), expected)
|
||||
|
||||
def test_sonority_sequencing_syllable_tokenizer(self):
    """
    Split one English word into syllables with SyllableTokenizer.
    """
    syllables = SyllableTokenizer().tokenize('justification')
    self.assertEqual(syllables, ['jus', 'ti', 'fi', 'ca', 'tion'])
|
||||
|
||||
def test_stanford_segmenter_arabic(self):
    """
    Segment an Arabic sentence with the Stanford Word Segmenter in its
    default Arabic configuration; the test is skipped when a
    LookupError is raised along the way.
    """
    expected = [
        'يبحث',
        'علم',
        'الحاسوب',
        'استخدام',
        'الحوسبة',
        'ب',
        'جميع',
        'اشكال',
        'ها',
        'ل',
        'حل',
        'المشكلات',
    ]
    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config('ar')
        sentence = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        segmented = segmenter.segment(sentence.split())
        assert segmented.split() == expected
    except LookupError as err:
        raise SkipTest(str(err))
|
||||
|
||||
def test_stanford_segmenter_chinese(self):
    """
    Segment a Chinese sentence with the Stanford Word Segmenter in its
    default Chinese configuration; the test is skipped when a
    LookupError is raised along the way.
    """
    expected = ['这', '是', '斯坦福', '中文', '分词器', '测试']
    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config('zh')
        sentence = u"这是斯坦福中文分词器测试"
        segmented = segmenter.segment(sentence.split())
        assert segmented.split() == expected
    except LookupError as err:
        raise SkipTest(str(err))
|
||||
|
||||
def test_phone_tokenizer(self):
    """
    Tokenize strings that resemble a phone number, one of them broken
    by a newline.
    """
    tokenizer = TweetTokenizer()
    cases = [
        # Should be recognized as a phone number, albeit one with
        # multiple spaces.
        ("(393) 928 -3010", ['(393) 928 -3010']),
        # Because of the newline the first three elements are not part
        # of a phone number; the fourth is.
        ("(393)\n928 -3010", ['(', '393', ')', "928 -3010"]),
    ]
    for text, expected in cases:
        self.assertEqual(tokenizer.tokenize(text), expected)
|
||||
|
||||
def test_remove_handle(self):
|
||||
"""
|
||||
Test remove_handle() from casual.py with specially crafted edge cases
|
||||
"""
|
||||
|
||||
tokenizer = TweetTokenizer(strip_handles=True)
|
||||
|
||||
# Simple example. Handles with just numbers should be allowed
|
||||
test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
|
||||
expected = ['hello', '.', 'hi']
|
||||
result = tokenizer.tokenize(test1)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Handles are allowed to follow any of the following characters
|
||||
test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
|
||||
expected = [
|
||||
'`',
|
||||
'~',
|
||||
'(',
|
||||
')',
|
||||
'-',
|
||||
'=',
|
||||
'+',
|
||||
'\\',
|
||||
'|',
|
||||
'[',
|
||||
']',
|
||||
'{',
|
||||
'}',
|
||||
';',
|
||||
':',
|
||||
"'",
|
||||
'"',
|
||||
'/',
|
||||
'?',
|
||||
'.',
|
||||
',',
|
||||
'<',
|
||||
'>',
|
||||
'ñ',
|
||||
'.',
|
||||
'ü',
|
||||
'.',
|
||||
'ç',
|
||||
'.',
|
||||
]
|
||||
result = tokenizer.tokenize(test2)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Handles are NOT allowed to follow any of the following characters
|
||||
test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
|
||||
expected = [
|
||||
'a',
|
||||
'@n',
|
||||
'j',
|
||||
'@n',
|
||||
'z',
|
||||
'@n',
|
||||
'A',
|
||||
'@n',
|
||||
'L',
|
||||
'@n',
|
||||
'Z',
|
||||
'@n',
|
||||
'1',
|
||||
'@n',
|
||||
'4',
|
||||
'@n',
|
||||
'7',
|
||||
'@n',
|
||||
'9',
|
||||
'@n',
|
||||
'0',
|
||||
'@n',
|
||||
'_',
|
||||
'@n',
|
||||
'!',
|
||||
'@n',
|
||||
'@',
|
||||
'@n',
|
||||
'#',
|
||||
'@n',
|
||||
'$',
|
||||
'@n',
|
||||
'%',
|
||||
'@n',
|
||||
'&',
|
||||
'@n',
|
||||
'*',
|
||||
'@n',
|
||||
]
|
||||
result = tokenizer.tokenize(test3)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Handles are allowed to precede the following characters
|
||||
test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
|
||||
expected = ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a']
|
||||
result = tokenizer.tokenize(test4)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Tests interactions with special symbols and multiple @
|
||||
test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
|
||||
expected = [
|
||||
'!',
|
||||
'@n',
|
||||
'#',
|
||||
'@n',
|
||||
'$',
|
||||
'@n',
|
||||
'%',
|
||||
'@n',
|
||||
'&',
|
||||
'@n',
|
||||
'*',
|
||||
'@n',
|
||||
'@n',
|
||||
'@n',
|
||||
'@',
|
||||
'@n',
|
||||
'@n',
|
||||
'@',
|
||||
'@n',
|
||||
'@n_',
|
||||
'@n',
|
||||
'@n7',
|
||||
'@n',
|
||||
'@nj',
|
||||
'@n',
|
||||
]
|
||||
result = tokenizer.tokenize(test5)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Tests that handles can have a max length of 20
|
||||
test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle"
|
||||
expected = ['uvwxyz', '1234', '_', 'endofhandle']
|
||||
result = tokenizer.tokenize(test6)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Edge case where an @ comes directly after a long handle
|
||||
test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde"
|
||||
expected = [
|
||||
'u',
|
||||
'@abcde',
|
||||
'@abcdefghijklmnopqrst',
|
||||
'@abcde',
|
||||
'_',
|
||||
'@abcde',
|
||||
'5',
|
||||
'@abcde',
|
||||
]
|
||||
result = tokenizer.tokenize(test7)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_treebank_span_tokenizer(self):
|
||||
"""
|
||||
Test TreebankWordTokenizer.span_tokenize function
|
||||
"""
|
||||
|
||||
tokenizer = TreebankWordTokenizer()
|
||||
|
||||
# Test case in the docstring
|
||||
test1 = "Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks)."
|
||||
expected = [
|
||||
(0, 4),
|
||||
(5, 12),
|
||||
(13, 17),
|
||||
(18, 19),
|
||||
(19, 23),
|
||||
(24, 26),
|
||||
(27, 30),
|
||||
(31, 32),
|
||||
(32, 36),
|
||||
(36, 37),
|
||||
(37, 38),
|
||||
(40, 46),
|
||||
(47, 48),
|
||||
(48, 51),
|
||||
(51, 52),
|
||||
(53, 55),
|
||||
(56, 59),
|
||||
(60, 62),
|
||||
(63, 68),
|
||||
(69, 70),
|
||||
(70, 76),
|
||||
(76, 77),
|
||||
(77, 78),
|
||||
]
|
||||
result = list(tokenizer.span_tokenize(test1))
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Test case with double quotation
|
||||
test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
|
||||
expected = [
|
||||
(0, 3),
|
||||
(4, 7),
|
||||
(8, 10),
|
||||
(11, 18),
|
||||
(19, 21),
|
||||
(22, 25),
|
||||
(26, 27),
|
||||
(27, 36),
|
||||
(37, 42),
|
||||
(42, 43),
|
||||
(44, 46),
|
||||
(47, 50),
|
||||
(51, 57),
|
||||
(58, 64),
|
||||
(65, 68),
|
||||
(69, 74),
|
||||
(75, 76),
|
||||
(77, 85),
|
||||
(86, 92),
|
||||
(93, 95),
|
||||
(96, 102),
|
||||
(103, 109),
|
||||
]
|
||||
result = list(tokenizer.span_tokenize(test2))
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# Test case with double qoutation as well as converted quotations
|
||||
test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
|
||||
expected = [
|
||||
(0, 3),
|
||||
(4, 7),
|
||||
(8, 10),
|
||||
(11, 18),
|
||||
(19, 21),
|
||||
(22, 25),
|
||||
(26, 27),
|
||||
(27, 36),
|
||||
(37, 42),
|
||||
(42, 43),
|
||||
(44, 46),
|
||||
(47, 50),
|
||||
(51, 57),
|
||||
(58, 64),
|
||||
(65, 68),
|
||||
(69, 74),
|
||||
(75, 76),
|
||||
(77, 79),
|
||||
(79, 87),
|
||||
(87, 89),
|
||||
(90, 96),
|
||||
(97, 99),
|
||||
(100, 106),
|
||||
(107, 113),
|
||||
]
|
||||
result = list(tokenizer.span_tokenize(test3))
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_word_tokenize(self):
|
||||
"""
|
||||
Test word_tokenize function
|
||||
"""
|
||||
|
||||
sentence = "The 'v', I've been fooled but I'll seek revenge."
|
||||
expected = ['The', "'", 'v', "'", ',', 'I', "'ve", 'been', 'fooled',
|
||||
'but', 'I', "'ll", 'seek', 'revenge', '.']
|
||||
self.assertEqual(word_tokenize(sentence), expected)
|
||||
|
||||
sentence = "'v' 're'"
|
||||
expected = ["'", 'v', "'", "'re", "'"]
|
||||
self.assertEqual(word_tokenize(sentence), expected)
|
||||
|
||||
def test_punkt_pair_iter(self):
|
||||
|
||||
test_cases = [
|
||||
('12', [('1', '2'), ('2', None)]),
|
||||
('123', [('1', '2'), ('2', '3'), ('3', None)]),
|
||||
('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
|
||||
]
|
||||
|
||||
for (test_input, expected_output) in test_cases:
|
||||
actual_output = [x for x in punkt._pair_iter(test_input)]
|
||||
|
||||
assert_equal(actual_output, expected_output)
|
||||
|
||||
def test_punkt_pair_iter_handles_stop_iteration_exception(self):
|
||||
# test input to trigger StopIteration from next()
|
||||
it = iter([])
|
||||
# call method under test and produce a generator
|
||||
gen = punkt._pair_iter(it)
|
||||
# unpack generator, ensure that no error is raised
|
||||
list(gen)
|
||||
|
||||
def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
|
||||
obj = punkt.PunktBaseClass()
|
||||
|
||||
class TestPunktTokenizeWordsMock:
|
||||
def word_tokenize(self, s):
|
||||
return iter([])
|
||||
|
||||
obj._lang_vars = TestPunktTokenizeWordsMock()
|
||||
# unpack generator, ensure that no error is raised
|
||||
list(obj._tokenize_words('test'))
|
||||
@@ -0,0 +1,181 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for static parts of Twitter package
|
||||
"""
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from nose import SkipTest
|
||||
|
||||
try:
|
||||
import twython
|
||||
except ImportError as e:
|
||||
raise SkipTest("The twython library has not been installed.")
|
||||
|
||||
from nltk.twitter import Authenticate
|
||||
|
||||
|
||||
class TestCredentials(unittest.TestCase):
    """
    Tests that Twitter credentials information from file is handled correctly.
    """

    def setUp(self):
        self.subdir = os.path.join(os.path.dirname(__file__), 'files')
        # Export TWITTER *before* constructing Authenticate: test_environment
        # compares auth.creds_subdir with this variable, so the object is
        # presumably initialised from it. Setting it afterwards made the
        # result depend on which test happened to run first.
        os.environ['TWITTER'] = 'twitter-files'
        self.auth = Authenticate()

    def test_environment(self):
        """
        Test that environment variable has been read correctly.
        """
        fn = os.path.basename(self.auth.creds_subdir)
        self.assertEqual(fn, os.environ['TWITTER'])

    def test_empty_subdir1(self):
        """
        Setting subdir to empty path should raise an error.
        """
        # ValueError (zero length field name in format) on python 2.6,
        # OSError for the rest.
        with self.assertRaises((OSError, ValueError)):
            self.auth.load_creds(subdir='')

    def test_empty_subdir2(self):
        """
        Setting subdir to `None` should raise an error.
        """
        self.auth.creds_subdir = None
        with self.assertRaises(ValueError):
            self.auth.load_creds()

    def test_missingdir(self):
        """
        Setting subdir to nonexistent directory should raise an error.
        """
        # ValueError (zero length field name in format) on python 2.6,
        # OSError for the rest.
        with self.assertRaises((OSError, ValueError)):
            self.auth.load_creds(subdir='/nosuchdir')

    def test_missingfile1(self):
        """
        Defaults for authentication will fail since 'credentials.txt' not
        present in default subdir, as read from `os.environ['TWITTER']`.
        """
        # ValueError (zero length field name in format) on python 2.6,
        # OSError for the rest.
        with self.assertRaises((OSError, ValueError)):
            self.auth.load_creds()

    def test_missingfile2(self):
        """
        Credentials file 'foobar' cannot be found in default subdir.
        """
        # ValueError (zero length field name in format) on python 2.6,
        # OSError for the rest.
        with self.assertRaises((OSError, ValueError)):
            self.auth.load_creds(creds_file='foobar')

    def test_incomplete_file(self):
        """
        Credentials file 'bad_oauth1-1.txt' is incomplete
        """
        with self.assertRaises(ValueError):
            self.auth.load_creds(creds_file='bad_oauth1-1.txt', subdir=self.subdir)

    def test_malformed_file1(self):
        """
        First key in credentials file 'bad_oauth1-2.txt' is ill-formed
        """
        with self.assertRaises(ValueError):
            self.auth.load_creds(creds_file='bad_oauth1-2.txt', subdir=self.subdir)

    def test_malformed_file2(self):
        """
        Credentials file 'bad_oauth1-3.txt' is ill-formed
        """
        with self.assertRaises(ValueError):
            self.auth.load_creds(creds_file='bad_oauth1-3.txt', subdir=self.subdir)

    def test_correct_path(self):
        """
        Path to default credentials file is well-formed, given specified
        subdir.
        """
        self.auth.load_creds(subdir=self.subdir)
        # The original assigned to creds_fullpath instead of checking it, so
        # the test could never fail. Both sides are normalised so that only
        # the location is compared, not its spelling.
        self.assertEqual(
            os.path.normpath(self.auth.creds_fullpath),
            os.path.normpath(os.path.join(self.subdir, self.auth.creds_file)),
        )

    def test_correct_file1(self):
        """
        Default credentials file is identified
        """
        self.auth.load_creds(subdir=self.subdir)
        self.assertEqual(self.auth.creds_file, 'credentials.txt')

    def test_correct_file2(self):
        """
        Default credentials file has been read correctly
        """
        oauth = self.auth.load_creds(subdir=self.subdir)
        self.assertEqual(oauth['app_key'], 'a')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
221
venv/lib/python3.7/site-packages/nltk/test/unit/test_wordnet.py
Normal file
221
venv/lib/python3.7/site-packages/nltk/test/unit/test_wordnet.py
Normal file
@@ -0,0 +1,221 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unit tests for nltk.corpus.wordnet
|
||||
See also nltk/test/wordnet.doctest
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import collections
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from nose import SkipTest
|
||||
|
||||
from nltk.corpus.reader.wordnet import WordNetCorpusReader
|
||||
from nltk.corpus import wordnet as wn
|
||||
from nltk.corpus import wordnet_ic as wnic
|
||||
from nltk.data import find as find_data
|
||||
|
||||
|
||||
wn.ensure_loaded()
|
||||
S = wn.synset
|
||||
L = wn.lemma
|
||||
|
||||
|
||||
class WordnNetDemo(unittest.TestCase):
    """
    Lookup and relation checks against WordNet, using the module-level
    shortcuts ``S`` (``wn.synset``) and ``L`` (``wn.lemma``).
    """

    def test_retrieve_synset(self):
        move_synset = S('go.v.21')
        self.assertEqual(move_synset.name(), "move.v.15")
        self.assertEqual(move_synset.lemma_names(), ['move', 'go'])
        self.assertEqual(
            move_synset.definition(), "have a turn; make one's move in a game"
        )
        self.assertEqual(move_synset.examples(), ['Can I go now?'])

    def test_retrieve_synsets(self):
        self.assertEqual(sorted(wn.synsets('zap', pos='n')), [S('zap.n.01')])
        self.assertEqual(
            sorted(wn.synsets('zap', pos='v')),
            [S('microwave.v.01'), S('nuke.v.01'), S('zap.v.01'), S('zap.v.02')],
        )

    def test_hyperhyponyms(self):
        # Not every synset has hypernyms()
        self.assertEqual(S('travel.v.01').hypernyms(), [])
        self.assertEqual(S('travel.v.02').hypernyms(), [S('travel.v.03')])
        self.assertEqual(S('travel.v.03').hypernyms(), [])

        # Test hyper-/hyponyms.
        self.assertEqual(S('breakfast.n.1').hypernyms(), [S('meal.n.01')])
        first_five_meal_hypo = [
            S('banquet.n.02'),
            S('bite.n.04'),
            S('breakfast.n.01'),
            S('brunch.n.01'),
            S('buffet.n.02'),
        ]
        self.assertEqual(sorted(S('meal.n.1').hyponyms()[:5]), first_five_meal_hypo)
        self.assertEqual(S('Austen.n.1').instance_hypernyms(), [S('writer.n.01')])
        first_five_composer_hypo = [
            S('ambrose.n.01'),
            S('bach.n.01'),
            S('barber.n.01'),
            S('bartok.n.01'),
            S('beethoven.n.01'),
        ]
        self.assertEqual(
            S('composer.n.1').instance_hyponyms()[:5], first_five_composer_hypo
        )

        # Test root hyper-/hyponyms
        self.assertEqual(S('person.n.01').root_hypernyms(), [S('entity.n.01')])
        self.assertEqual(S('sail.v.01').root_hypernyms(), [S('travel.v.01')])
        self.assertEqual(
            S('fall.v.12').root_hypernyms(), [S('act.v.01'), S('fall.v.17')]
        )

    def test_derivationally_related_forms(self):
        # Test `derivationally_related_forms()`
        self.assertEqual(
            L('zap.v.03.nuke').derivationally_related_forms(),
            [L('atomic_warhead.n.01.nuke')],
        )
        self.assertEqual(
            L('zap.v.03.atomize').derivationally_related_forms(),
            [L('atomization.n.02.atomization')],
        )
        self.assertEqual(
            L('zap.v.03.atomise').derivationally_related_forms(),
            [L('atomization.n.02.atomisation')],
        )
        self.assertEqual(L('zap.v.03.zap').derivationally_related_forms(), [])

    def test_meronyms_holonyms(self):
        # Test meronyms, holonyms.
        self.assertEqual(
            S('dog.n.01').member_holonyms(), [S('canis.n.01'), S('pack.n.06')]
        )
        self.assertEqual(S('dog.n.01').part_meronyms(), [S('flag.n.07')])

        self.assertEqual(S('faculty.n.2').member_meronyms(), [S('professor.n.01')])
        self.assertEqual(S('copilot.n.1').member_holonyms(), [S('crew.n.01')])

        self.assertEqual(
            S('table.n.2').part_meronyms(),
            [S('leg.n.03'), S('tabletop.n.01'), S('tableware.n.01')],
        )
        self.assertEqual(S('course.n.7').part_holonyms(), [S('meal.n.01')])

        self.assertEqual(
            S('water.n.1').substance_meronyms(), [S('hydrogen.n.01'), S('oxygen.n.01')]
        )
        self.assertEqual(
            S('gin.n.1').substance_holonyms(),
            [
                S('gin_and_it.n.01'),
                S('gin_and_tonic.n.01'),
                S('martini.n.01'),
                S('pink_lady.n.01'),
            ],
        )

    def test_antonyms(self):
        # Test antonyms.
        self.assertEqual(
            L('leader.n.1.leader').antonyms(), [L('follower.n.01.follower')]
        )
        self.assertEqual(
            L('increase.v.1.increase').antonyms(), [L('decrease.v.01.decrease')]
        )

    def test_misc_relations(self):
        # Test misc relations.
        self.assertEqual(S('snore.v.1').entailments(), [S('sleep.v.01')])
        self.assertEqual(
            S('heavy.a.1').similar_tos(),
            [
                S('dense.s.03'),
                S('doughy.s.01'),
                S('heavier-than-air.s.01'),
                S('hefty.s.02'),
                S('massive.s.04'),
                S('non-buoyant.s.01'),
                S('ponderous.s.02'),
            ],
        )
        self.assertEqual(S('light.a.1').attributes(), [S('weight.n.01')])
        self.assertEqual(S('heavy.a.1').attributes(), [S('weight.n.01')])

        # Test pertainyms.
        self.assertEqual(
            L('English.a.1.English').pertainyms(), [L('england.n.01.England')]
        )

    def test_lch(self):
        # Test LCH.
        self.assertEqual(
            S('person.n.01').lowest_common_hypernyms(S('dog.n.01')),
            [S('organism.n.01')],
        )
        self.assertEqual(
            S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02')),
            [S('woman.n.01')],
        )

    def test_domains(self):
        # Test domains.
        self.assertEqual(S('code.n.03').topic_domains(), [S('computer_science.n.01')])
        self.assertEqual(S('pukka.a.01').region_domains(), [S('india.n.01')])
        self.assertEqual(S('freaky.a.01').usage_domains(), [S('slang.n.02')])

    def test_in_topic_domains(self):
        # Test in domains.
        self.assertEqual(
            S('computer_science.n.01').in_topic_domains()[0], S('access.n.05')
        )
        self.assertEqual(S('germany.n.01').in_region_domains()[23], S('trillion.n.02'))
        self.assertEqual(S('slang.n.02').in_usage_domains()[1], S('airhead.n.01'))

    def test_wordnet_similarities(self):
        # Path based similarities.
        self.assertAlmostEqual(S('cat.n.01').path_similarity(S('cat.n.01')), 1.0)
        self.assertAlmostEqual(S('dog.n.01').path_similarity(S('cat.n.01')), 0.2)
        self.assertAlmostEqual(
            S('dog.n.01').lch_similarity(S('cat.n.01')), 2.028, places=3
        )
        self.assertAlmostEqual(
            S('dog.n.01').wup_similarity(S('cat.n.01')), 0.8571, places=3
        )
        # Information Content similarities.
        brown_ic = wnic.ic('ic-brown.dat')
        self.assertAlmostEqual(
            S('dog.n.01').jcn_similarity(S('cat.n.01'), brown_ic), 0.4497, places=3
        )
        semcor_ic = wnic.ic('ic-semcor.dat')
        self.assertAlmostEqual(
            S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic), 0.8863, places=3
        )

    def test_omw_lemma_no_trailing_underscore(self):
        expected = [
            u'popolna_sprememba_v_mišljenju',
            u'popoln_obrat',
            u'preobrat',
            u'preobrat_v_mišljenju'
        ]
        self.assertEqual(S('about-face.n.02').lemma_names(lang='slv'), expected)

    def test_iterable_type_for_all_lemma_names(self):
        # Duck-test for iterables.
        # See https://stackoverflow.com/a/36230057/610569
        cat_lemmas = wn.all_lemma_names(lang='cat')
        eng_lemmas = wn.all_lemma_names(lang='eng')

        self.assertTrue(hasattr(eng_lemmas, '__iter__'))
        self.assertTrue(hasattr(eng_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
        self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)

        self.assertTrue(hasattr(cat_lemmas, '__iter__'))
        # Both halves must inspect cat_lemmas; the original checked
        # eng_lemmas for `next` here, so the Python 2 branch never tested
        # the Catalan iterator.
        self.assertTrue(hasattr(cat_lemmas, '__next__') or hasattr(cat_lemmas, 'next'))
        self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,271 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for BLEU translation evaluation metric
|
||||
"""
|
||||
|
||||
import functools
|
||||
import io
|
||||
import unittest
|
||||
|
||||
from nltk.data import find
|
||||
from nltk.translate.bleu_score import (
|
||||
modified_precision,
|
||||
brevity_penalty,
|
||||
closest_ref_length,
|
||||
)
|
||||
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
|
||||
|
||||
|
||||
class TestBLEU(unittest.TestCase):
    """
    Checks for modified_precision, brevity_penalty/closest_ref_length and
    sentence_bleu on small hand-written inputs.
    """

    def test_modified_precision(self):
        """
        Examples from the original BLEU paper
        http://www.aclweb.org/anthology/P02-1040.pdf
        """
        # Example 1: the "the*" example.
        # Reference sentences.
        ref1 = 'the cat is on the mat'.split()
        ref2 = 'there is a cat on the mat'.split()
        # Hypothesis sentence(s).
        hyp1 = 'the the the the the the the'.split()

        references = [ref1, ref2]

        # Testing modified unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        self.assertEqual(round(hyp1_unigram_precision, 4), 0.2857)
        # With assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)

        # Testing modified bigram precision.
        self.assertEqual(float(modified_precision(references, hyp1, n=2)), 0.0)

        # Example 2: the "of the" example.
        # Reference sentences
        ref1 = str(
            'It is a guide to action that ensures that the military '
            'will forever heed Party commands'
        ).split()
        ref2 = str(
            'It is the guiding principle which guarantees the military '
            'forces always being under the command of the Party'
        ).split()
        ref3 = str(
            'It is the practical guide for the army always to heed '
            'the directions of the party'
        ).split()
        # Hypothesis sentence(s).
        hyp1 = 'of the'.split()

        references = [ref1, ref2, ref3]
        # Testing modified unigram precision.
        self.assertEqual(float(modified_precision(references, hyp1, n=1)), 1.0)

        # Testing modified bigram precision.
        self.assertEqual(float(modified_precision(references, hyp1, n=2)), 1.0)

        # Example 3: Proper MT outputs.
        hyp1 = str(
            'It is a guide to action which ensures that the military '
            'always obeys the commands of the party'
        ).split()
        hyp2 = str(
            'It is to insure the troops forever hearing the activity '
            'guidebook that party direct'
        ).split()

        references = [ref1, ref2, ref3]

        # Unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
        # Test unigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
        self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
        # Test unigram precision with rounding.
        self.assertEqual(round(hyp1_unigram_precision, 4), 0.9444)
        self.assertEqual(round(hyp2_unigram_precision, 4), 0.5714)

        # Bigram precision
        hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
        hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
        # Test bigram precision with assertAlmostEqual at 4 place precision.
        self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
        self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
        # Test bigram precision with rounding.
        self.assertEqual(round(hyp1_bigram_precision, 4), 0.5882)
        self.assertEqual(round(hyp2_bigram_precision, 4), 0.0769)

    def test_brevity_penalty(self):
        # Test case from brevity_penalty_closest function in mteval-v13a.pl.
        # Same test cases as in the doctest in nltk.translate.bleu_score.py
        references = [['a'] * 11, ['a'] * 8]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertAlmostEqual(
            brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
        )

        references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertEqual(brevity_penalty(closest_ref_len, hyp_len), 1.0)

    def test_zero_matches(self):
        # Test case where there's 0 matches
        references = ['The candidate has no alignment to any of the references'.split()]
        hypothesis = 'John loves Mary'.split()

        # Test BLEU for n-gram orders 1 .. len(hypothesis) - 1
        # (range() excludes its upper bound).
        for n in range(1, len(hypothesis)):
            weights = [1.0 / n] * n  # Uniform weights.
            self.assertEqual(sentence_bleu(references, hypothesis, weights), 0)

    def test_full_matches(self):
        # Test case where there's 100% matches
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary'.split()

        # Test BLEU for n-gram orders 1 .. len(hypothesis) - 1
        # (range() excludes its upper bound).
        for n in range(1, len(hypothesis)):
            weights = [1.0 / n] * n  # Uniform weights.
            self.assertEqual(sentence_bleu(references, hypothesis, weights), 1.0)

    def test_partial_matches_hypothesis_longer_than_reference(self):
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary who loves Mike'.split()
        # Since no 4-grams matches were found the result should be zero
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised because len(reference) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
|
||||
|
||||
|
||||
# @unittest.skip("Skipping fringe cases for BLEU.")
|
||||
class TestBLEUFringeCases(unittest.TestCase):
    """
    sentence_bleu on degenerate inputs: n-gram order larger than the input,
    empty hypothesis and/or reference, and inputs shorter than four tokens.
    """

    def test_case_where_n_is_bigger_than_hypothesis_length(self):
        # Test BLEU to nth order of n-grams, where n > len(hypothesis).
        references = ['John loves Mary ?'.split()]
        hypothesis = 'John loves Mary'.split()
        n = len(hypothesis) + 1  # One order beyond the hypothesis length.
        weights = [1.0 / n] * n  # Uniform weights.
        # Since no n-grams matches were found the result should be zero
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )
        # Checks that the warning has been raised because len(hypothesis) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.

        # Test case where n > len(hypothesis) but so is n > len(reference), and
        # it's a special case where reference == hypothesis.
        # The weights computed above are reused here.
        references = ['John loves Mary'.split()]
        hypothesis = 'John loves Mary'.split()
        # Since no 4-grams matches were found the result should be zero
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )

    def test_empty_hypothesis(self):
        # Test case where the hypothesis is empty.
        references = ['The candidate has no alignment to any of the references'.split()]
        hypothesis = []
        self.assertEqual(sentence_bleu(references, hypothesis), 0)

    def test_empty_references(self):
        # Test case where the reference is empty.
        references = [[]]
        hypothesis = 'John loves Mary'.split()
        self.assertEqual(sentence_bleu(references, hypothesis), 0)

    def test_empty_references_and_hypothesis(self):
        # Test case where both the reference and the hypothesis are empty.
        references = [[]]
        hypothesis = []
        self.assertEqual(sentence_bleu(references, hypothesis), 0)

    def test_reference_or_hypothesis_shorter_than_fourgrams(self):
        # Test case where the length of reference or hypothesis
        # is shorter than 4.
        references = ['let it go'.split()]
        hypothesis = 'let go it'.split()
        # Checks that the value the hypothesis and reference returns is 0.0
        # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
|
||||
|
||||
|
||||
class TestBLEUvsMteval13a(unittest.TestCase):
    """
    Compares corpus_bleu against the scores stored in an mteval-13a output
    file for the same reference/hypothesis pair.
    """

    def test_corpus_bleu(self):
        """
        corpus_bleu, with and without smoothing, must stay within 0.005 of
        each stored mteval score.
        """
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are on the second-to-last line of the file; the
            # slice drops its first and last whitespace-separated fields.
            # Materialised as a list because the scores are iterated twice
            # below: on Python 3 a bare map() is exhausted after the first
            # loop, which silently skipped every smoothed comparison.
            mteval_bleu_scores = list(
                map(float, mteval_fin.readlines()[-2].split()[1:-1])
            )

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() automatically strip().
                hypothesis = [line.split() for line in hyp_fin]
                # Note that the corpus_bleu input is list of list of references.
                references = [[line.split()] for line in ref_fin]

        # Without smoothing.
        for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
            nltk_bleu = corpus_bleu(
                references, hypothesis, weights=(1.0 / i,) * i
            )
            # Check that the BLEU scores difference is less than 0.005 .
            # Note: This is an approximate comparison; as much as
            #       +/- 0.01 BLEU might be "statistically significant",
            #       the actual translation quality might not be.
            self.assertLess(abs(mteval_bleu - nltk_bleu), 0.005)

        # With the same smoothing method used in mteval-v13a.pl
        chencherry = SmoothingFunction()
        for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
            nltk_bleu = corpus_bleu(
                references,
                hypothesis,
                weights=(1.0 / i,) * i,
                smoothing_function=chencherry.method3,
            )
            self.assertLess(abs(mteval_bleu - nltk_bleu), 0.005)
|
||||
|
||||
|
||||
class TestBLEUWithBadSentence(unittest.TestCase):
    """BLEU on a hypothesis that shares almost nothing with its reference."""

    def test_corpus_bleu_with_bad_sentence(self):
        """``corpus_bleu`` should score ~0.0 for a garbage hypothesis.

        Where ``assertWarns`` is available, the call is also expected to
        emit a ``UserWarning``.
        """
        hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
        ref = str(
            "Their tasks include changing a pump on the faulty stokehold ."
            "Likewise , two species that are very similar in morphology "
            "were distinguished using genetics ."
        )
        hypotheses = [hyp.split()]
        references = [[ref.split()]]

        try:
            # A warning is expected here; the original note attributes it to
            # the 2-gram count of this hypothesis.
            with self.assertWarns(UserWarning):
                # The (undesirable) BLEU output for such input is ~0.
                score = corpus_bleu(references, hypotheses)
                self.assertAlmostEqual(score, 0.0, places=4)
        except AttributeError:
            # unittest.TestCase.assertWarns is only supported in Python >= 3.2,
            # so fall back to checking the score alone.
            score = corpus_bleu(references, hypotheses)
            self.assertAlmostEqual(score, 0.0, places=4)
|
||||
@@ -0,0 +1,157 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests GDFA alignments
|
||||
"""
|
||||
|
||||
import functools
|
||||
import io
|
||||
import unittest
|
||||
|
||||
from nltk.translate.gdfa import grow_diag_final_and
|
||||
|
||||
|
||||
class TestGDFA(unittest.TestCase):
    """Regression test for ``grow_diag_final_and``."""

    def test_from_eflomal_outputs(self):
        """Run GDFA on the first 10 eflomal outputs from issue #1829.

        See https://github.com/nltk/nltk/issues/1829
        """
        # Input.
        forwards = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14',
            '0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10',
            '0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31',
            '0-0 1-1 0-2 2-3',
            '0-0 2-2 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20',
            '3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14',
            '1-0',
        ]
        backwards = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13',
            '0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8',
            '0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31',
            '0-0 1-1 2-3',
            '0-0 1-1 2-3 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18',
            '0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10',
            '1-0',
        ]
        source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
        target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]

        # Expected Output.
        expected = [
            [(0, 0), (1, 2)],
            [(0, 0), (1, 1)],
            [
                (0, 0), (2, 1), (3, 2), (4, 3), (5, 4),
                (6, 5), (7, 6), (8, 7), (10, 10), (11, 12),
            ],
            [
                (0, 0), (1, 1), (1, 2), (2, 3), (3, 4),
                (4, 5), (4, 6), (5, 7), (6, 8), (7, 5),
                (8, 7), (8, 9), (9, 8), (9, 10),
            ],
            [
                (0, 0), (1, 8), (2, 9), (3, 10), (4, 11),
                (5, 8), (6, 9), (6, 11), (7, 10), (8, 11),
                (31, 31),
            ],
            [(0, 0), (0, 2), (1, 1), (2, 3)],
            [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
            [
                (0, 0), (1, 1), (2, 3), (3, 4), (5, 5),
                (7, 6), (8, 7), (9, 8), (10, 9), (11, 10),
                (12, 11), (13, 12), (14, 13), (15, 14), (16, 16),
                (17, 17), (18, 18), (19, 19),
            ],
            [
                (0, 0), (1, 1), (3, 0), (3, 2), (4, 1),
                (5, 3), (6, 2), (6, 4), (7, 5), (8, 6),
                (9, 7), (9, 12), (10, 8), (10, 13), (11, 9),
                (12, 8), (12, 14), (13, 9), (14, 8), (15, 9),
                (16, 10),
            ],
            [(1, 0)],
            [
                (0, 0), (1, 1), (3, 2), (4, 3), (5, 4),
                (6, 5), (7, 6), (9, 10), (10, 12), (11, 13),
                (12, 14), (13, 15),
            ],
        ]

        # Iterate through all 10 examples and check for expected outputs.
        # NOTE(review): `expected` holds 11 entries but there are only 10
        # inputs; zip() stops at the shortest sequence, so the final entry
        # is never compared -- confirm whether it is stale.
        cases = zip(source_lens, target_lens, forwards, backwards, expected)
        for src_len, trg_len, fw, bw, expect in cases:
            actual = grow_diag_final_and(src_len, trg_len, fw, bw)
            self.assertListEqual(expect, actual)
|
||||
@@ -0,0 +1,76 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for IBM Model 1 training methods
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from collections import defaultdict
|
||||
from nltk.translate import AlignedSent
|
||||
from nltk.translate import IBMModel
|
||||
from nltk.translate import IBMModel1
|
||||
from nltk.translate.ibm_model import AlignmentInfo
|
||||
|
||||
|
||||
class TestIBMModel1(unittest.TestCase):
    """Unit tests for ``IBMModel1`` probability initialisation and scoring."""

    @staticmethod
    def _ham_spam_corpus():
        """Build a fresh two-sentence parallel corpus for the uniform tests."""
        return [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]

    def test_set_uniform_translation_probabilities(self):
        """In-domain word pairs receive the uniform translation probability."""
        # arrange
        corpus = self._ham_spam_corpus()
        model1 = IBMModel1(corpus, 0)

        # act
        model1.set_uniform_probabilities(corpus)

        # assert
        # expected_prob = 1.0 / (target vocab size + 1)
        table = model1.translation_table
        self.assertEqual(table['ham']['eier'], 1.0 / 3)
        self.assertEqual(table['eggs'][None], 1.0 / 3)

    def test_set_uniform_translation_probabilities_of_non_domain_values(self):
        """Words absent from the training data fall back to ``MIN_PROB``."""
        # arrange
        corpus = self._ham_spam_corpus()
        model1 = IBMModel1(corpus, 0)

        # act
        model1.set_uniform_probabilities(corpus)

        # assert
        # examine target words that are not in the training data domain
        out_of_domain = model1.translation_table['parrot']['eier']
        self.assertEqual(out_of_domain, IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        """``prob_t_a_given_s`` equals the product of lexical probabilities."""
        # arrange
        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        corpus = [AlignedSent(trg_sentence, src_sentence)]
        alignment_info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + src_sentence,
            ['UNUSED'] + trg_sentence,
            None,
        )

        # Every aligned (target, source) word pair gets probability 0.98.
        translation_table = defaultdict(lambda: defaultdict(float))
        word_pairs = [
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ]
        for trg_word, src_word in word_pairs:
            translation_table[trg_word][src_word] = 0.98

        model1 = IBMModel1(corpus, 0)
        model1.translation_table = translation_table

        # act
        probability = model1.prob_t_a_given_s(alignment_info)

        # assert
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        expected_probability = lexical_translation
        self.assertEqual(round(probability, 4), round(expected_probability, 4))
|
||||
@@ -0,0 +1,89 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for IBM Model 2 training methods
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from collections import defaultdict
|
||||
from nltk.translate import AlignedSent
|
||||
from nltk.translate import IBMModel
|
||||
from nltk.translate import IBMModel2
|
||||
from nltk.translate.ibm_model import AlignmentInfo
|
||||
|
||||
|
||||
class TestIBMModel2(unittest.TestCase):
    """Unit tests for ``IBMModel2`` alignment probabilities and scoring."""

    @staticmethod
    def _ham_spam_corpus():
        """Build a fresh two-sentence parallel corpus for the uniform tests."""
        return [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]

    def test_set_uniform_alignment_probabilities(self):
        """In-domain index combinations receive the uniform alignment probability."""
        # arrange
        corpus = self._ham_spam_corpus()
        model2 = IBMModel2(corpus, 0)

        # act
        model2.set_uniform_probabilities(corpus)

        # assert
        # expected_prob = 1.0 / (length of source sentence + 1)
        table = model2.alignment_table
        self.assertEqual(table[0][1][3][2], 1.0 / 4)
        self.assertEqual(table[2][4][2][4], 1.0 / 3)

    def test_set_uniform_alignment_probabilities_of_non_domain_values(self):
        """Index values outside the training data fall back to ``MIN_PROB``."""
        # arrange
        corpus = self._ham_spam_corpus()
        model2 = IBMModel2(corpus, 0)

        # act
        model2.set_uniform_probabilities(corpus)

        # assert
        # examine i and j values that are not in the training data domain
        table = model2.alignment_table
        self.assertEqual(table[99][1][3][2], IBMModel.MIN_PROB)
        self.assertEqual(table[2][99][2][4], IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        """``prob_t_a_given_s`` equals lexical translation times alignment."""
        # arrange
        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        corpus = [AlignedSent(trg_sentence, src_sentence)]
        alignment_info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + src_sentence,
            ['UNUSED'] + trg_sentence,
            None,
        )

        translation_table = defaultdict(lambda: defaultdict(float))
        word_pairs = [
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ]
        for trg_word, src_word in word_pairs:
            translation_table[trg_word][src_word] = 0.98

        alignment_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        )
        # (i, j, probability); the sentence lengths are fixed at l=5, m=6.
        alignment_entries = [
            (0, 3, 0.97),  # None -> to
            (1, 1, 0.97),  # ich -> i
            (2, 4, 0.97),  # esse -> eat
            (4, 2, 0.97),  # gern -> love
            (5, 5, 0.96),  # räucherschinken -> smoked
            (5, 6, 0.96),  # räucherschinken -> ham
        ]
        for i, j, prob in alignment_entries:
            alignment_table[i][j][5][6] = prob

        model2 = IBMModel2(corpus, 0)
        model2.translation_table = translation_table
        model2.alignment_table = alignment_table

        # act
        probability = model2.prob_t_a_given_s(alignment_info)

        # assert
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
        expected_probability = lexical_translation * alignment
        self.assertEqual(round(probability, 4), round(expected_probability, 4))
|
||||
@@ -0,0 +1,108 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for IBM Model 3 training methods
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from collections import defaultdict
|
||||
from nltk.translate import AlignedSent
|
||||
from nltk.translate import IBMModel
|
||||
from nltk.translate import IBMModel3
|
||||
from nltk.translate.ibm_model import AlignmentInfo
|
||||
|
||||
|
||||
class TestIBMModel3(unittest.TestCase):
    """Unit tests for ``IBMModel3`` distortion probabilities and scoring."""

    @staticmethod
    def _ham_spam_corpus():
        """Build a fresh two-sentence parallel corpus for the uniform tests."""
        return [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]

    def test_set_uniform_distortion_probabilities(self):
        """In-domain index combinations receive the uniform distortion probability."""
        # arrange
        corpus = self._ham_spam_corpus()
        model3 = IBMModel3(corpus, 0)

        # act
        model3.set_uniform_probabilities(corpus)

        # assert
        # expected_prob = 1.0 / length of target sentence
        table = model3.distortion_table
        self.assertEqual(table[1][0][3][2], 1.0 / 2)
        self.assertEqual(table[4][2][2][4], 1.0 / 4)

    def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
        """Index values outside the training data fall back to ``MIN_PROB``."""
        # arrange
        corpus = self._ham_spam_corpus()
        model3 = IBMModel3(corpus, 0)

        # act
        model3.set_uniform_probabilities(corpus)

        # assert
        # examine i and j values that are not in the training data domain
        table = model3.distortion_table
        self.assertEqual(table[0][0][3][2], IBMModel.MIN_PROB)
        self.assertEqual(table[9][2][2][4], IBMModel.MIN_PROB)
        self.assertEqual(table[2][9][2][4], IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        """``prob_t_a_given_s`` multiplies the four component probabilities.

        The expected value is null generation x fertility x lexical
        translation x distortion.
        """
        # arrange
        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        corpus = [AlignedSent(trg_sentence, src_sentence)]
        alignment_info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + src_sentence,
            ['UNUSED'] + trg_sentence,
            [[3], [1], [4], [], [2], [5, 6]],
        )

        distortion_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        )
        # (j, i) index pairs; the sentence lengths are fixed at l=5, m=6.
        distortion_entries = [
            (1, 1),  # i -> ich
            (2, 4),  # love -> gern
            (3, 0),  # to -> NULL
            (4, 2),  # eat -> esse
            (5, 5),  # smoked -> räucherschinken
            (6, 5),  # ham -> räucherschinken
        ]
        for j, i in distortion_entries:
            distortion_table[j][i][5][6] = 0.97

        translation_table = defaultdict(lambda: defaultdict(float))
        word_pairs = [
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ]
        for trg_word, src_word in word_pairs:
            translation_table[trg_word][src_word] = 0.98

        fertility_table = defaultdict(lambda: defaultdict(float))
        fertility_entries = [
            (1, 'ich', 0.99),
            (1, 'esse', 0.99),
            (0, 'ja', 0.99),
            (1, 'gern', 0.99),
            (2, 'räucherschinken', 0.999),
            (1, None, 0.99),
        ]
        for fertility_value, src_word, prob in fertility_entries:
            fertility_table[fertility_value][src_word] = prob

        probabilities = {
            'p1': 0.167,
            'translation_table': translation_table,
            'distortion_table': distortion_table,
            'fertility_table': fertility_table,
            'alignment_table': None,
        }

        model3 = IBMModel3(corpus, 0, probabilities)

        # act
        probability = model3.prob_t_a_given_s(alignment_info)

        # assert
        null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
        fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
        expected_probability = (
            null_generation * fertility * lexical_translation * distortion
        )
        self.assertEqual(round(probability, 4), round(expected_probability, 4))
|
||||
@@ -0,0 +1,123 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for IBM Model 4 training methods
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from collections import defaultdict
|
||||
from nltk.translate import AlignedSent
|
||||
from nltk.translate import IBMModel
|
||||
from nltk.translate import IBMModel4
|
||||
from nltk.translate.ibm_model import AlignmentInfo
|
||||
|
||||
|
||||
class TestIBMModel4(unittest.TestCase):
    """Unit tests for ``IBMModel4`` distortion tables and scoring."""

    @staticmethod
    def _ham_spam_fixture():
        """Return ``(corpus, src_classes, trg_classes)`` for the uniform tests."""
        src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        corpus = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        return corpus, src_classes, trg_classes

    def test_set_uniform_distortion_probabilities_of_max_displacements(self):
        """Boundary displacement values receive the uniform probability."""
        # arrange
        corpus, src_classes, trg_classes = self._ham_spam_fixture()
        model4 = IBMModel4(corpus, 0, src_classes, trg_classes)

        # act
        model4.set_uniform_probabilities(corpus)

        # assert
        # number of displacement values =
        #     2 *(number of words in longest target sentence - 1)
        expected_prob = 1.0 / (2 * (4 - 1))

        # examine the boundary values for (displacement, src_class, trg_class)
        head = model4.head_distortion_table
        self.assertEqual(head[3][0][0], expected_prob)
        self.assertEqual(head[-3][1][2], expected_prob)
        non_head = model4.non_head_distortion_table
        self.assertEqual(non_head[3][0], expected_prob)
        self.assertEqual(non_head[-3][2], expected_prob)

    def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
        """Displacements outside the training data fall back to ``MIN_PROB``."""
        # arrange
        corpus, src_classes, trg_classes = self._ham_spam_fixture()
        model4 = IBMModel4(corpus, 0, src_classes, trg_classes)

        # act
        model4.set_uniform_probabilities(corpus)

        # assert
        # examine displacement values that are not in the training data domain
        head = model4.head_distortion_table
        self.assertEqual(head[4][0][0], IBMModel.MIN_PROB)
        self.assertEqual(head[100][1][2], IBMModel.MIN_PROB)
        non_head = model4.non_head_distortion_table
        self.assertEqual(non_head[4][0], IBMModel.MIN_PROB)
        self.assertEqual(non_head[100][2], IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        """``prob_t_a_given_s`` multiplies the four component probabilities.

        The expected value is null generation x fertility x lexical
        translation x distortion.
        """
        # arrange
        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4}
        trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4}
        corpus = [AlignedSent(trg_sentence, src_sentence)]
        alignment_info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + src_sentence,
            ['UNUSED'] + trg_sentence,
            [[3], [1], [4], [], [2], [5, 6]],
        )

        head_distortion_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        # (displacement, src_class, trg_class) triples.
        head_entries = [
            (1, None, 3),  # None, i
            (3, 2, 4),  # ich, eat
            (-2, 3, 4),  # esse, love
            (3, 4, 1),  # gern, smoked
        ]
        for displacement, src_class, trg_class in head_entries:
            head_distortion_table[displacement][src_class][trg_class] = 0.97

        non_head_distortion_table = defaultdict(lambda: defaultdict(float))
        non_head_distortion_table[1][0] = 0.96  # ham

        translation_table = defaultdict(lambda: defaultdict(float))
        word_pairs = [
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ]
        for trg_word, src_word in word_pairs:
            translation_table[trg_word][src_word] = 0.98

        fertility_table = defaultdict(lambda: defaultdict(float))
        fertility_entries = [
            (1, 'ich', 0.99),
            (1, 'esse', 0.99),
            (0, 'ja', 0.99),
            (1, 'gern', 0.99),
            (2, 'räucherschinken', 0.999),
            (1, None, 0.99),
        ]
        for fertility_value, src_word, prob in fertility_entries:
            fertility_table[fertility_value][src_word] = prob

        probabilities = {
            'p1': 0.167,
            'translation_table': translation_table,
            'head_distortion_table': head_distortion_table,
            'non_head_distortion_table': non_head_distortion_table,
            'fertility_table': fertility_table,
            'alignment_table': None,
        }

        model4 = IBMModel4(corpus, 0, src_classes, trg_classes, probabilities)

        # act
        probability = model4.prob_t_a_given_s(alignment_info)

        # assert
        null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
        fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
        expected_probability = (
            null_generation * fertility * lexical_translation * distortion
        )
        self.assertEqual(round(probability, 4), round(expected_probability, 4))
|
||||
@@ -0,0 +1,164 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for IBM Model 5 training methods
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from collections import defaultdict
|
||||
from nltk.translate import AlignedSent
|
||||
from nltk.translate import IBMModel
|
||||
from nltk.translate import IBMModel4
|
||||
from nltk.translate import IBMModel5
|
||||
from nltk.translate.ibm_model import AlignmentInfo
|
||||
|
||||
|
||||
class TestIBMModel5(unittest.TestCase):
    """Unit tests for ``IBMModel5`` vacancy tables, scoring and pruning."""

    def test_set_uniform_vacancy_probabilities_of_max_displacements(self):
        """Boundary vacancy-difference values receive the uniform probability."""
        # arrange
        src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        corpus = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        model5 = IBMModel5(corpus, 0, src_classes, trg_classes)

        # act
        model5.set_uniform_probabilities(corpus)

        # assert
        # number of vacancy difference values =
        #     2 * number of words in longest target sentence
        expected_prob = 1.0 / (2 * 4)

        # examine the boundary values for (dv, max_v, trg_class)
        self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob)
        self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob)
        self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
        self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)

    def test_set_uniform_vacancy_probabilities_of_non_domain_values(self):
        """Values outside the training data fall back to ``MIN_PROB``."""
        # arrange
        src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        corpus = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        model5 = IBMModel5(corpus, 0, src_classes, trg_classes)

        # act
        model5.set_uniform_probabilities(corpus)

        # assert
        # examine dv and max_v values that are not in the training data domain
        self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
        self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
        self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB)
        self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
        self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        """``prob_t_a_given_s`` multiplies the four component probabilities.

        The expected value is null generation x fertility x lexical
        translation x vacancy.
        """
        # arrange
        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4}
        trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4}
        corpus = [AlignedSent(trg_sentence, src_sentence)]
        alignment_info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + src_sentence,
            ['UNUSED'] + trg_sentence,
            [[3], [1], [4], [], [2], [5, 6]],
        )

        head_vacancy_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        head_vacancy_table[1 - 0][6][3] = 0.97  # ich -> i
        head_vacancy_table[3 - 0][5][4] = 0.97  # esse -> eat
        head_vacancy_table[1 - 2][4][4] = 0.97  # gern -> love
        head_vacancy_table[2 - 0][2][1] = 0.97  # räucherschinken -> smoked

        non_head_vacancy_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        non_head_vacancy_table[1 - 0][1][0] = 0.96  # räucherschinken -> ham

        translation_table = defaultdict(lambda: defaultdict(float))
        translation_table['i']['ich'] = 0.98
        translation_table['love']['gern'] = 0.98
        translation_table['to'][None] = 0.98
        translation_table['eat']['esse'] = 0.98
        translation_table['smoked']['räucherschinken'] = 0.98
        translation_table['ham']['räucherschinken'] = 0.98

        fertility_table = defaultdict(lambda: defaultdict(float))
        fertility_table[1]['ich'] = 0.99
        fertility_table[1]['esse'] = 0.99
        fertility_table[0]['ja'] = 0.99
        fertility_table[1]['gern'] = 0.99
        fertility_table[2]['räucherschinken'] = 0.999
        fertility_table[1][None] = 0.99

        probabilities = {
            'p1': 0.167,
            'translation_table': translation_table,
            'fertility_table': fertility_table,
            'head_vacancy_table': head_vacancy_table,
            'non_head_vacancy_table': non_head_vacancy_table,
            'head_distortion_table': None,
            'non_head_distortion_table': None,
            'alignment_table': None,
        }

        model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities)

        # act
        probability = model5.prob_t_a_given_s(alignment_info)

        # assert
        null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
        fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
        expected_probability = (
            null_generation * fertility * lexical_translation * vacancy
        )
        self.assertEqual(round(probability, 4), round(expected_probability, 4))

    def test_prune(self):
        """``prune`` keeps 3 of the 5 candidate alignments.

        ``IBMModel4.model4_prob_t_a_given_s`` is temporarily replaced by a
        lookup into a fixed score table, so the outcome depends only on
        those scores.
        """
        # arrange
        alignment_infos = [
            AlignmentInfo((1, 1), None, None, None),
            AlignmentInfo((1, 2), None, None, None),
            AlignmentInfo((2, 1), None, None, None),
            AlignmentInfo((2, 2), None, None, None),
            AlignmentInfo((0, 0), None, None, None),
        ]
        min_factor = IBMModel5.MIN_SCORE_FACTOR
        best_score = 0.9
        scores = {
            (1, 1): min(min_factor * 1.5, 1) * best_score,  # above threshold
            (1, 2): best_score,
            (2, 1): min_factor * best_score,  # at threshold
            (2, 2): min_factor * best_score * 0.5,  # low score
            (0, 0): min(min_factor * 1.1, 1) * 1.2,  # above threshold
        }
        corpus = [AlignedSent(['a'], ['b'])]
        original_prob_function = IBMModel4.model4_prob_t_a_given_s
        # mock static method
        IBMModel4.model4_prob_t_a_given_s = staticmethod(
            lambda a, model: scores[a.alignment]
        )
        try:
            model5 = IBMModel5(corpus, 0, None, None)

            # act
            pruned_alignments = model5.prune(alignment_infos)

            # assert
            self.assertEqual(len(pruned_alignments), 3)
        finally:
            # Restore the static method even when the assertion fails or
            # prune() raises, so the mock cannot leak into other tests.
            # Re-wrap in staticmethod: reading the attribute through the
            # class yielded the plain underlying function.
            IBMModel4.model4_prob_t_a_given_s = staticmethod(
                original_prob_function
            )
|
||||
@@ -0,0 +1,279 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for common methods of IBM translation models
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from collections import defaultdict
|
||||
from nltk.translate import AlignedSent
|
||||
from nltk.translate import IBMModel
|
||||
from nltk.translate.ibm_model import AlignmentInfo
|
||||
|
||||
|
||||
class TestIBMModel(unittest.TestCase):
|
||||
__TEST_SRC_SENTENCE = ["j'", 'aime', 'bien', 'jambon']
|
||||
__TEST_TRG_SENTENCE = ['i', 'love', 'ham']
|
||||
|
||||
def test_vocabularies_are_initialized(self):
|
||||
parallel_corpora = [
|
||||
AlignedSent(['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
|
||||
AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
|
||||
AlignedSent([], ['sept']),
|
||||
]
|
||||
|
||||
ibm_model = IBMModel(parallel_corpora)
|
||||
self.assertEqual(len(ibm_model.src_vocab), 8)
|
||||
self.assertEqual(len(ibm_model.trg_vocab), 6)
|
||||
|
||||
def test_vocabularies_are_initialized_even_with_empty_corpora(self):
|
||||
parallel_corpora = []
|
||||
|
||||
ibm_model = IBMModel(parallel_corpora)
|
||||
self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token
|
||||
self.assertEqual(len(ibm_model.trg_vocab), 0)
|
||||
|
||||
def test_best_model2_alignment(self):
|
||||
# arrange
|
||||
sentence_pair = AlignedSent(
|
||||
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
|
||||
)
|
||||
# None and 'bien' have zero fertility
|
||||
translation_table = {
|
||||
'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
|
||||
'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
|
||||
'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
|
||||
}
|
||||
alignment_table = defaultdict(
|
||||
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
|
||||
)
|
||||
|
||||
ibm_model = IBMModel([])
|
||||
ibm_model.translation_table = translation_table
|
||||
ibm_model.alignment_table = alignment_table
|
||||
|
||||
# act
|
||||
a_info = ibm_model.best_model2_alignment(sentence_pair)
|
||||
|
||||
# assert
|
||||
self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused
|
||||
self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])
|
||||
|
||||
def test_best_model2_alignment_does_not_change_pegged_alignment(self):
|
||||
# arrange
|
||||
sentence_pair = AlignedSent(
|
||||
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
|
||||
)
|
||||
translation_table = {
|
||||
'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
|
||||
'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
|
||||
'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
|
||||
}
|
||||
alignment_table = defaultdict(
|
||||
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
|
||||
)
|
||||
|
||||
ibm_model = IBMModel([])
|
||||
ibm_model.translation_table = translation_table
|
||||
ibm_model.alignment_table = alignment_table
|
||||
|
||||
# act: force 'love' to be pegged to 'jambon'
|
||||
a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4)
|
||||
# assert
|
||||
self.assertEqual(a_info.alignment[1:], (1, 4, 4))
|
||||
self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]])
|
||||
|
||||
def test_best_model2_alignment_handles_fertile_words(self):
|
||||
# arrange
|
||||
sentence_pair = AlignedSent(
|
||||
['i', 'really', ',', 'really', 'love', 'ham'],
|
||||
TestIBMModel.__TEST_SRC_SENTENCE,
|
||||
)
|
||||
# 'bien' produces 2 target words: 'really' and another 'really'
|
||||
translation_table = {
|
||||
'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
|
||||
'really': {"j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09},
|
||||
',': {"j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7},
|
||||
'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
|
||||
'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
|
||||
}
|
||||
alignment_table = defaultdict(
|
||||
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
|
||||
)
|
||||
|
||||
ibm_model = IBMModel([])
|
||||
ibm_model.translation_table = translation_table
|
||||
ibm_model.alignment_table = alignment_table
|
||||
|
||||
# act
|
||||
a_info = ibm_model.best_model2_alignment(sentence_pair)
|
||||
|
||||
# assert
|
||||
self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
|
||||
self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])
|
||||
|
||||
def test_best_model2_alignment_handles_empty_src_sentence(self):
|
||||
# arrange
|
||||
sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
|
||||
ibm_model = IBMModel([])
|
||||
|
||||
# act
|
||||
a_info = ibm_model.best_model2_alignment(sentence_pair)
|
||||
|
||||
# assert
|
||||
self.assertEqual(a_info.alignment[1:], (0, 0, 0))
|
||||
self.assertEqual(a_info.cepts, [[1, 2, 3]])
|
||||
|
||||
def test_best_model2_alignment_handles_empty_trg_sentence(self):
    # Scenario: the aligned pair has an empty first (target-side) word list.

    # arrange
    model = IBMModel([])
    pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)

    # act
    result = model.best_model2_alignment(pair)

    # assert: nothing to align, so the alignment past index 0 is empty and
    # each of the five cepts stays empty.
    self.assertEqual(result.alignment[1:], ())
    self.assertEqual(result.cepts, [[], [], [], [], []])
|
||||
|
||||
def test_neighboring_finds_neighbor_alignments(self):
    # Checks the full set of alignment tuples produced by neighboring()
    # when no position is pegged.

    # arrange
    start = AlignmentInfo(
        (0, 3, 2),
        (None, 'des', 'œufs', 'verts'),
        ('UNUSED', 'green', 'eggs'),
        [[], [], [2], [1]],
    )
    model = IBMModel([])

    # act
    found = {candidate.alignment for candidate in model.neighboring(start)}

    # assert
    moves = {
        (0, 0, 2),
        (0, 1, 2),
        (0, 2, 2),
        (0, 3, 0),
        (0, 3, 1),
        (0, 3, 3),
    }
    swaps = {(0, 2, 3)}
    original = {(0, 3, 2)}
    self.assertEqual(found, moves | swaps | original)
|
||||
|
||||
def test_neighboring_sets_neighbor_alignment_info(self):
    # Checks that the cepts carried by selected neighbors are consistent
    # with their alignment tuples.

    # arrange
    start = AlignmentInfo(
        (0, 3, 2),
        (None, 'des', 'œufs', 'verts'),
        ('UNUSED', 'green', 'eggs'),
        [[], [], [2], [1]],
    )
    model = IBMModel([])

    # act
    candidates = model.neighboring(start)

    # assert: pick out two particular alignments.
    # (0, 2, 2) differs from the start in one position; (0, 3, 2) is the
    # same tuple as the starting alignment.
    for candidate in candidates:
        if candidate.alignment == (0, 2, 2):
            moved = candidate
        elif candidate.alignment == (0, 3, 2):
            same_as_start = candidate

    self.assertEqual(moved.cepts, [[], [], [1, 2], []])
    self.assertEqual(same_as_start.cepts, [[], [], [2], [1]])
|
||||
|
||||
def test_neighboring_returns_neighbors_with_pegged_alignment(self):
    # Checks that pegging position 2 keeps its alignment value fixed in
    # every neighbor that is returned.

    # arrange
    start = AlignmentInfo(
        (0, 3, 2),
        (None, 'des', 'œufs', 'verts'),
        ('UNUSED', 'green', 'eggs'),
        [[], [], [2], [1]],
    )
    model = IBMModel([])

    # act: peg 'eggs' to align with 'œufs'
    found = {candidate.alignment for candidate in model.neighboring(start, 2)}

    # assert: only position 1 varies; no swaps are expected
    moves = {(0, 0, 2), (0, 1, 2), (0, 2, 2)}
    original = {(0, 3, 2)}
    self.assertEqual(found, moves | original)
|
||||
|
||||
def test_hillclimb(self):
    # Drives hillclimb() with mocked neighbor generation and scoring so the
    # climb follows a known path.

    # arrange
    start = AlignmentInfo((0, 3, 2), None, None, None)

    # Neighbor graph used by the mock: alignment -> alignments of neighbors.
    neighbor_graph = {
        (0, 3, 2): [(0, 2, 2), (0, 1, 1)],
        (0, 2, 2): [(0, 3, 3), (0, 4, 4)],
    }
    # Score table used by the mock; unknown alignments score 0.01.
    scores = {
        (0, 3, 2): 0.5,
        (0, 2, 2): 0.6,
        (0, 1, 1): 0.4,
        (0, 3, 3): 0.6,
        (0, 4, 4): 0.7,
    }

    def neighboring_mock(a, j):
        # Alignments absent from the graph have no neighbors.
        return set(
            AlignmentInfo(alignment, None, None, None)
            for alignment in neighbor_graph.get(a.alignment, ())
        )

    def prob_t_a_given_s_mock(a):
        return scores.get(a.alignment, 0.01)

    model = IBMModel([])
    model.neighboring = neighboring_mock
    model.prob_t_a_given_s = prob_t_a_given_s_mock

    # act
    climbed = model.hillclimb(start)

    # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
    self.assertEqual(climbed.alignment, (0, 4, 4))
|
||||
|
||||
def test_sample(self):
    # Checks the number of alignments sample() returns when every alignment
    # is given the same constant probability.

    # arrange
    pair = AlignedSent(
        TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
    )
    model = IBMModel([])
    model.prob_t_a_given_s = lambda x: 0.001

    # act: the second element (best alignment) is not inspected here
    sampled, _ = model.sample(pair)

    # assert
    self.assertEqual(len(sampled), 61)
|
||||
@@ -0,0 +1,37 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for NIST translation evaluation metric
|
||||
"""
|
||||
|
||||
import io
|
||||
import unittest
|
||||
|
||||
from nltk.data import find
|
||||
from nltk.translate.nist_score import sentence_nist, corpus_nist
|
||||
|
||||
|
||||
class TestNIST(unittest.TestCase):
    """Compare NLTK's corpus-level NIST scores with recorded mteval-13a output."""

    def test_sentence_nist(self):
        # NOTE: despite its name, this test exercises corpus_nist().
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the NIST scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are on the 4th line from the end of the file.
            # The first and last whitespace-separated fields of that line
            # are dropped; everything in between is parsed as a float.
            mteval_nist_scores = list(
                map(float, mteval_fin.readlines()[-4].split()[1:-1])
            )

        # Guard against a vacuous pass: with no parsed scores the comparison
        # loop below would simply not run.
        self.assertTrue(
            mteval_nist_scores, 'no NIST scores parsed from mteval output'
        )

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() automatically strip().
                hypotheses = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_nist input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
                    nltk_nist = corpus_nist(references, hypotheses, i)
                    # Check that the NIST scores differ by less than 0.05.
                    # A unittest assertion is used instead of a bare
                    # `assert`, which is removed under `python -O`.
                    self.assertLess(abs(mteval_nist - nltk_nist), 0.05)
|
||||
@@ -0,0 +1,295 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Stack decoder
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Tests for stack decoder
|
||||
"""
|
||||
|
||||
import unittest
|
||||
from collections import defaultdict
|
||||
from math import log
|
||||
from nltk.translate import PhraseTable
|
||||
from nltk.translate import StackDecoder
|
||||
from nltk.translate.stack_decoder import _Hypothesis, _Stack
|
||||
|
||||
|
||||
class TestStackDecoder(unittest.TestCase):
    """Unit tests for StackDecoder, using a fake phrase table and language model."""

    def test_find_all_src_phrases(self):
        # arrange
        decoder = StackDecoder(TestStackDecoder.create_fake_phrase_table(), None)
        words = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')

        # act
        spans_from = decoder.find_all_src_phrases(words)

        # assert: each entry lists the end positions of phrases that start
        # at that index
        self.assertEqual(spans_from[0], [2])  # 'my hovercraft'
        self.assertEqual(spans_from[1], [2])  # 'hovercraft'
        self.assertEqual(spans_from[2], [3])  # 'is'
        self.assertEqual(spans_from[3], [5, 6])  # 'full of', 'full of eels'
        self.assertFalse(spans_from[4])  # no entry starting with 'of'
        self.assertEqual(spans_from[5], [6])  # 'eels'

    def test_distortion_score(self):
        # arrange
        decoder = StackDecoder(None, None)
        decoder.distortion_factor = 0.5
        previous = _Hypothesis()
        previous.src_phrase_span = (3, 5)

        # act
        actual = decoder.distortion_score(previous, (8, 10))

        # assert: log of the factor times the gap between the end of the
        # previous span (5) and the start of the next one (8)
        self.assertEqual(actual, log(decoder.distortion_factor) * (8 - 5))

    def test_distortion_score_of_first_expansion(self):
        # arrange
        decoder = StackDecoder(None, None)
        decoder.distortion_factor = 0.5
        empty = _Hypothesis()

        # act
        actual = decoder.distortion_score(empty, (8, 10))

        # assert
        # expansion from empty hypothesis always has zero distortion cost
        self.assertEqual(actual, 0.0)

    def test_compute_future_costs(self):
        # arrange
        table = TestStackDecoder.create_fake_phrase_table()
        lm = TestStackDecoder.create_fake_language_model()
        decoder = StackDecoder(table, lm)
        words = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')

        # act
        scores = decoder.compute_future_scores(words)

        # assert: a span present in the phrase table scores as the first
        # translation's log probability plus the language model value
        unigram = ('hovercraft',)
        self.assertEqual(
            scores[1][2],
            (table.translations_for(unigram)[0].log_prob + lm.probability(unigram)),
        )
        bigram = ('my', 'hovercraft')
        self.assertEqual(
            scores[0][2],
            (table.translations_for(bigram)[0].log_prob + lm.probability(bigram)),
        )

    def test_compute_future_costs_for_phrases_not_in_phrase_table(self):
        # arrange
        table = TestStackDecoder.create_fake_phrase_table()
        lm = TestStackDecoder.create_fake_language_model()
        decoder = StackDecoder(table, lm)
        words = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')

        # act
        scores = decoder.compute_future_scores(words)

        # assert: 'hovercraft is' is not in phrase table, so its score is
        # expected to back off to the sum of its two sub-spans
        self.assertEqual(scores[1][3], scores[1][2] + scores[2][3])

    def test_future_score(self):
        # arrange: sentence with 8 words; words 2, 3, 4 already translated
        partial = _Hypothesis()
        partial.untranslated_spans = lambda _: [(0, 2), (5, 8)]  # mock
        score_table = defaultdict(lambda: defaultdict(float))
        score_table[0][2] = 0.4
        score_table[5][8] = 0.5
        decoder = StackDecoder(None, None)

        # act
        actual = decoder.future_score(partial, score_table, 8)

        # assert: sum of the table entries for the two untranslated spans
        self.assertEqual(actual, 0.4 + 0.5)

    def test_valid_phrases(self):
        # arrange
        partial = _Hypothesis()
        # mock untranslated_spans method
        partial.untranslated_spans = lambda _: [(0, 2), (3, 6)]
        phrase_ends_from = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]]

        # act
        usable = StackDecoder.valid_phrases(phrase_ends_from, partial)

        # assert
        self.assertEqual(usable, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)])

    @staticmethod
    def create_fake_phrase_table():
        """Return a PhraseTable filled with a fixed set of entries with blank targets."""
        # (source phrase, target phrase, value passed to add()), in
        # insertion order
        entries = [
            (('hovercraft',), ('',), 0.8),
            (('my', 'hovercraft'), ('', ''), 0.7),
            (('my', 'cheese'), ('', ''), 0.7),
            (('is',), ('',), 0.8),
            (('is',), ('',), 0.5),
            (('full', 'of'), ('', ''), 0.01),
            (('full', 'of', 'eels'), ('', '', ''), 0.5),
            (('full', 'of', 'spam'), ('', ''), 0.5),
            (('eels',), ('',), 0.5),
            (('spam',), ('',), 0.5),
        ]
        table = PhraseTable()
        for src_phrase, trg_phrase, value in entries:
            table.add(src_phrase, trg_phrase, value)
        return table

    @staticmethod
    def create_fake_language_model():
        """Return a stub object whose probability(phrase) reads a fixed table."""
        # nltk.model should be used here once it is implemented
        # Phrases missing from the table evaluate to -999.0.
        table = defaultdict(lambda: -999.0)
        for word in ('my', 'hovercraft', 'is', 'full', 'of', 'eels'):
            table[(word,)] = log(0.1)
        table[('my', 'hovercraft')] = log(0.3)

        def probability(_, phrase):
            return table[phrase]

        # Anonymous one-off class exposing only the `probability` method.
        return type('', (object,), {'probability': probability})()
|
||||
|
||||
|
||||
class TestHypothesis(unittest.TestCase):
    """Unit tests for _Hypothesis, run against a three-link hypothesis chain."""

    def setUp(self):
        # Chain: empty root -> 'hello world' over span (3, 7)
        #                   -> 'and goodbye' over span (1, 2)
        first = _Hypothesis(
            raw_score=0.5,
            src_phrase_span=(3, 7),
            trg_phrase=('hello', 'world'),
            previous=_Hypothesis(),
        )
        second = _Hypothesis(
            raw_score=0.4,
            src_phrase_span=(1, 2),
            trg_phrase=('and', 'goodbye'),
            previous=first,
        )
        self.hypothesis_chain = second

    def test_translation_so_far(self):
        # act
        words = self.hypothesis_chain.translation_so_far()

        # assert: target phrases appear oldest first
        self.assertEqual(words, ['hello', 'world', 'and', 'goodbye'])

    def test_translation_so_far_for_empty_hypothesis(self):
        # arrange
        empty = _Hypothesis()

        # act
        words = empty.translation_so_far()

        # assert
        self.assertEqual(words, [])

    def test_total_translated_words(self):
        # act
        count = self.hypothesis_chain.total_translated_words()

        # assert: spans (3, 7) and (1, 2) together account for 5 positions
        self.assertEqual(count, 5)

    def test_translated_positions(self):
        # act
        positions = self.hypothesis_chain.translated_positions()

        # assert: the comparison is made on the sorted positions
        self.assertEqual(sorted(positions), [1, 3, 4, 5, 6])

    def test_untranslated_spans(self):
        # act
        gaps = self.hypothesis_chain.untranslated_spans(10)

        # assert
        self.assertEqual(gaps, [(0, 1), (2, 3), (7, 10)])

    def test_untranslated_spans_for_empty_hypothesis(self):
        # arrange
        empty = _Hypothesis()

        # act
        gaps = empty.untranslated_spans(10)

        # assert: nothing translated yet, so one span covers everything
        self.assertEqual(gaps, [(0, 10)])
|
||||
|
||||
|
||||
class TestStack(unittest.TestCase):
    """Unit tests for _Stack's push() pruning and best() lookup."""

    def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self):
        # arrange
        stack = _Stack(3)
        poor_hypothesis = _Hypothesis(0.01)
        pushes = (
            _Hypothesis(0.2),
            poor_hypothesis,
            _Hypothesis(0.1),
            _Hypothesis(0.3),
        )

        # act: four pushes into a stack built with size argument 3
        for hypothesis in pushes:
            stack.push(hypothesis)

        # assert
        self.assertNotIn(poor_hypothesis, stack)

    def test_push_removes_hypotheses_that_fall_below_beam_threshold(self):
        # arrange
        # NOTE(review): the second _Stack argument is presumably the beam
        # threshold, going by the test name - confirm against _Stack.
        stack = _Stack(3, 0.5)
        poor_hypothesis = _Hypothesis(0.01)
        worse_hypothesis = _Hypothesis(0.009)
        pushes = (
            poor_hypothesis,
            worse_hypothesis,
            _Hypothesis(0.9),  # greatly superior hypothesis
        )

        # act
        for hypothesis in pushes:
            stack.push(hypothesis)

        # assert
        self.assertNotIn(poor_hypothesis, stack)
        self.assertNotIn(worse_hypothesis, stack)

    def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self):
        # arrange
        stack = _Stack(3, 0.5)
        poor_hypothesis = _Hypothesis(0.01)
        pushes = (
            _Hypothesis(0.9),  # greatly superior hypothesis
            poor_hypothesis,
        )

        # act
        for hypothesis in pushes:
            stack.push(hypothesis)

        # assert
        self.assertNotIn(poor_hypothesis, stack)

    def test_best_returns_the_best_hypothesis(self):
        # arrange
        stack = _Stack(3)
        best_hypothesis = _Hypothesis(0.99)
        pushes = (_Hypothesis(0.0), best_hypothesis, _Hypothesis(0.5))

        # act
        for hypothesis in pushes:
            stack.push(hypothesis)

        # assert
        self.assertEqual(stack.best(), best_hypothesis)

    def test_best_returns_none_when_stack_is_empty(self):
        # arrange
        empty_stack = _Stack(3)

        # assert
        self.assertEqual(empty_stack.best(), None)
|
||||
47
venv/lib/python3.7/site-packages/nltk/test/unit/utils.py
Normal file
47
venv/lib/python3.7/site-packages/nltk/test/unit/utils.py
Normal file
@@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
from unittest import TestCase
|
||||
from functools import wraps
|
||||
from nose.plugins.skip import SkipTest
|
||||
from nltk.util import py26
|
||||
|
||||
|
||||
def skip(reason):
    """
    Build a decorator that unconditionally skips a test.

    The decorated item is tagged with ``__unittest_skip__`` and
    ``__unittest_skip_why__``. Anything that is not a ``TestCase`` subclass
    is additionally replaced by a wrapper that raises ``SkipTest(reason)``
    when called.
    """

    def decorator(test_item):
        if isinstance(test_item, type) and issubclass(test_item, TestCase):
            if py26():
                # Patch all test_ methods to raise SkipTest exception.
                # This is necessary for Python 2.6 because its unittest
                # doesn't understand __unittest_skip__.
                for attr_name in dir(test_item):
                    if attr_name.startswith('test_'):
                        original_method = getattr(test_item, attr_name)
                        setattr(test_item, attr_name, skip(reason)(original_method))
        else:

            @wraps(test_item)
            def skip_wrapper(*args, **kwargs):
                raise SkipTest(reason)

            skip_wrapper.__name__ = test_item.__name__
            test_item = skip_wrapper

        # Markers read by unittest's own skip machinery.
        test_item.__unittest_skip__ = True
        test_item.__unittest_skip_why__ = reason
        return test_item

    return decorator
|
||||
|
||||
|
||||
def skipIf(condition, reason):
    """
    Skip a test if the condition is true.

    Returns the ``skip(reason)`` decorator when ``condition`` is truthy,
    otherwise a decorator that hands the test item back unchanged.
    """
    if not condition:
        return lambda obj: obj
    return skip(reason)
|
||||
Reference in New Issue
Block a user