Initial commit

This commit is contained in:
Senad Uka
2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions

View File

@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
import six
from nltk import FreqDist
from nltk.lm import NgramCounter
from nltk.util import everygrams
class NgramCounterTests(unittest.TestCase):
    """Lookup-only tests for NgramCounter; the fixtures are never retrained."""

    @classmethod
    def setUpClass(cls):
        # Both counters see the same two character "sentences"; they differ
        # only in the longest ngram order they are fed.
        sentences = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            everygrams(sentence, max_len=3) for sentence in sentences
        )
        cls.bigram_counter = NgramCounter(
            everygrams(sentence, max_len=2) for sentence in sentences
        )

    def test_N(self):
        # 9 unigrams + 7 bigrams = 16; the trigram counter adds 5 trigrams.
        for counter, expected_total in (
            (self.bigram_counter, 16),
            (self.trigram_counter, 21),
        ):
            self.assertEqual(counter.N(), expected_total)

    def test_counter_len_changes_with_lookup(self):
        counter = self.bigram_counter
        self.assertEqual(len(counter), 2)
        # Merely looking up an unseen order is expected to register it.
        _ = counter[50]
        self.assertEqual(len(counter), 3)

    def test_ngram_order_access_unigrams(self):
        counter = self.bigram_counter
        self.assertEqual(counter[1], counter.unigrams)

    def test_ngram_conditional_freqdist(self):
        trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        bigram_contexts = [(letter,) for letter in "abdecg"]

        bigram_dist = self.trigram_counter[2]
        trigram_dist = self.trigram_counter[3]

        six.assertCountEqual(self, bigram_contexts, bigram_dist.conditions())
        six.assertCountEqual(self, trigram_contexts, trigram_dist.conditions())

    def test_bigram_counts_seen_ngrams(self):
        # Both "a b" and "b c" occur exactly once in the training text.
        after_a = self.bigram_counter[["a"]]
        self.assertEqual(1, after_a["b"])
        after_b = self.bigram_counter[["b"]]
        self.assertEqual(1, after_b["c"])

    def test_bigram_counts_unseen_ngrams(self):
        after_b = self.bigram_counter[["b"]]
        self.assertEqual(0, after_b["z"])

    def test_unigram_counts_seen_words(self):
        # "b" appears once in each training sentence.
        self.assertEqual(2, self.bigram_counter["b"])

    def test_unigram_counts_completely_unseen_words(self):
        self.assertEqual(0, self.bigram_counter["z"])
class NgramCounterTrainingTests(unittest.TestCase):
    """Tests covering how NgramCounter ingests (or rejects) training input."""

    def setUp(self):
        self.counter = NgramCounter()

    def _assert_nothing_counted(self, counter):
        # No bigram order registered and an empty unigram distribution.
        self.assertNotIn(2, counter)
        self.assertEqual(counter[1], FreqDist())

    def test_empty_string(self):
        self._assert_nothing_counted(NgramCounter(""))

    def test_empty_list(self):
        self._assert_nothing_counted(NgramCounter([]))

    def test_None(self):
        self._assert_nothing_counted(NgramCounter(None))

    def test_train_on_unigrams(self):
        words = list("abcd")
        unigram_sentence = [(word,) for word in words]
        counter = NgramCounter([unigram_sentence])

        # Higher orders must stay empty when only unigrams were supplied.
        self.assertFalse(counter[3])
        self.assertFalse(counter[2])
        six.assertCountEqual(self, words, counter[1].keys())

    def test_train_on_illegal_sentences(self):
        # Ngrams must be tuples: bare strings and lists are both rejected.
        flat_strings = ["Check", "this", "out", "!"]
        list_ngrams = [["Check", "this"], ["this", "out"], ["out", "!"]]

        for bad_sentence in (flat_strings, list_ngrams):
            with self.assertRaises(TypeError):
                NgramCounter([bad_sentence])

    def test_train_on_bigrams(self):
        counter = NgramCounter([[("a", "b"), ("c", "d")]])
        self.assertFalse(counter[3])

    def test_train_on_mix(self):
        # One sentence mixing bigrams, a trigram and a unigram.
        mixed_sentence = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)]
        counter = NgramCounter([mixed_sentence])

        expected_keys_by_order = (
            (1, ["h"]),
            (2, [("a",), ("c",)]),
            (3, [("e", "f")]),
        )
        for order, expected_keys in expected_keys_by_order:
            six.assertCountEqual(self, expected_keys, counter[order].keys())

View File

@@ -0,0 +1,446 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import division
import math
import sys
import unittest
from six import add_metaclass
from nltk.lm import (
Vocabulary,
MLE,
Lidstone,
Laplace,
WittenBellInterpolated,
KneserNeyInterpolated,
)
from nltk.lm.preprocessing import padded_everygrams
def _prepare_test_data(ngram_order):
    """Return the ``(vocabulary, training_text)`` pair shared by the model tests.

    The vocabulary is built with ``unk_cutoff=1``; the training text is one
    list of padded everygrams (up to ``ngram_order``) per sentence.
    """
    vocabulary = Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
    sentences = (list("abcd"), list("egadbe"))
    training_text = [
        list(padded_everygrams(ngram_order, sentence)) for sentence in sentences
    ]
    return vocabulary, training_text
class ParametrizeTestsMeta(type):
    """Metaclass that injects generated test methods into a test class.

    For every class it creates, it adds:

    * ``test_sumto1_<i>`` -- one per hard-coded context, checking that the
      scores of all vocabulary words in that context add up to 1;
    * ``test_score_<i>`` -- one per ``(word, context, expected_score)`` triple
      found in the class attribute ``score_tests`` (optional).

    The generated methods read ``self.model``, which the class must provide.
    """

    def __new__(cls, name, bases, dct):
        contexts = (
            ("a",),
            ("c",),
            (u"<s>",),
            ("b",),
            (u"<UNK>",),
            ("d",),
            ("e",),
            ("r",),
            ("w",),
        )
        dct.update(
            ("test_sumto1_{0}".format(index), cls.add_sum_to_1_test(ctx))
            for index, ctx in enumerate(contexts)
        )

        score_cases = dct.get("score_tests", [])
        for index, case in enumerate(score_cases):
            word, context, expected_score = case
            test_name = "test_score_{0}".format(index)
            dct[test_name] = cls.add_score_test(word, context, expected_score)

        return super(ParametrizeTestsMeta, cls).__new__(cls, name, bases, dct)

    @classmethod
    def add_score_test(cls, word, context, expected_score):
        """Build a test asserting ``model.score(word, context)`` to 4 places."""
        template = "word='{word}', context={context}"
        if not sys.version_info > (3, 5):
            # Python 2 doesn't report the mismatched values if we pass a custom
            # message, so we have to report them manually.
            template = "{score} != {expected_score} within 4 places, " + template

        def test_method(self):
            score = self.model.score(word, context)
            failure_note = template.format(
                score=score,
                expected_score=expected_score,
                word=word,
                context=context,
            )
            self.assertAlmostEqual(score, expected_score, msg=failure_note, places=4)

        return test_method

    @classmethod
    def add_sum_to_1_test(cls, context):
        """Build a test asserting the scores over the vocabulary sum to 1."""

        def test(self):
            model = self.model
            total = sum(model.score(token, context) for token in model.vocab)
            self.assertAlmostEqual(
                total, 1.0, msg="The context is {}".format(context)
            )

        return test
@add_metaclass(ParametrizeTestsMeta)
class MleBigramTests(unittest.TestCase):
    """Unit tests for the MLE model trained on bigrams."""

    score_tests = [
        ("d", ["c"], 1),
        # An unseen bigram scores 0.
        ("d", ["e"], 0),
        # So does a unigram that is in the vocabulary but never seen.
        ("z", None, 0),
        # 14 unigrams in total, 2 of them "a".
        ("a", None, 2.0 / 14),
        # "y" is out of vocabulary, so it is scored as "<UNK>" (count 3).
        ("y", None, 3.0 / 14),
    ]

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(2)
        self.model = MLE(2, vocabulary=vocabulary)
        self.model.fit(corpus)

    def test_logscore_zero_score(self):
        # A zero score has an infinite log score.
        unseen_logscore = self.model.logscore("d", ["e"])
        self.assertTrue(math.isinf(unseen_logscore))

    def test_entropy_perplexity_seen(self):
        # Every bigram below was seen during training.
        #
        # bigram      log score
        # <s>, a      -1
        # a, b        -1
        # b, UNK      -1
        # UNK, a      -1.585
        # a, d        -1
        # d, </s>     -1
        # total       -6.585   ->  negated average = 1.0975
        seen_bigrams = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        expected_entropy = 1.0975
        expected_perplexity = 2.1398

        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(seen_bigrams), places=4
        )
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(seen_bigrams), places=4
        )

    def test_entropy_perplexity_unseen(self):
        # In MLE, even one unseen ngram should make entropy and perplexity infinite
        with_unseen = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]
        entropy = self.model.entropy(with_unseen)
        self.assertTrue(math.isinf(entropy))
        perplexity = self.model.perplexity(with_unseen)
        self.assertTrue(math.isinf(perplexity))

    def test_entropy_perplexity_unigrams(self):
        # word    score    log score
        # <s>     0.1429   -2.8074
        # a       0.1429   -2.8074
        # c       0.0714   -3.8073
        # UNK     0.2143   -2.2224   ("-" is looked up as <UNK>)
        # d       0.1429   -2.8074
        # c       0.0714   -3.8073
        # </s>    0.1429   -2.8074
        # total            -21.6243  ->  negated average = 3.0095
        unigrams = [(token,) for token in ("<s>", "a", "c", "-", "d", "c", "</s>")]
        expected_entropy = 3.0095
        expected_perplexity = 8.0529

        self.assertAlmostEqual(expected_entropy, self.model.entropy(unigrams), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(unigrams), places=4
        )
@add_metaclass(ParametrizeTestsMeta)
class MleTrigramTests(unittest.TestCase):
    """Unit tests for the MLE model trained on trigrams."""

    score_tests = [
        # "b c d" occurs once and "b c" occurs once.
        ("d", ("b", "c"), 1),
        # "c d" occurs once and "c" occurs once.
        ("d", ["c"], 1),
        # 18 tokens in total, of which "a" occurred 2 times.
        ("a", None, 2.0 / 18),
        # In vocabulary but unseen.
        ("z", None, 0),
        # Out of vocabulary should use the "UNK" score.
        ("y", None, 3.0 / 18),
    ]

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocabulary)
        self.model.fit(corpus)
@add_metaclass(ParametrizeTestsMeta)
class LidstoneBigramTests(unittest.TestCase):
    """Unit tests for the Lidstone model (gamma = 0.1) trained on bigrams."""

    score_tests = [
        # count(d | c) = 1, smoothed to 1.1
        # total count after "c" = 1, smoothed to 1 + 8 * 0.1 = 1.8
        ("d", ["c"], 1.1 / 1.8),
        # Total unigrams: 14, vocab size: 8
        # Denominator: 14 + 0.8 = 14.8
        # count("a") = 2, smoothed to 2.1
        ("a", None, 2.1 / 14.8),
        # In vocabulary but unseen: count("z") = 0, smoothed to 0.1
        ("z", None, 0.1 / 14.8),
        # Out of vocabulary uses the "UNK" score:
        # count("<UNK>") = 3, smoothed to 3.1
        ("y", None, 3.1 / 14.8),
    ]

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocabulary)
        self.model.fit(corpus)

    def test_gamma(self):
        self.assertEqual(0.1, self.model.gamma)

    def test_entropy_perplexity(self):
        # Unlike MLE this should be able to handle completely novel ngrams
        #
        # bigram      score    log score
        # <s>, a      0.3929   -1.3479
        # a, c        0.0357   -4.8074
        # c, UNK      0.0(5)   -4.1699
        # UNK, d      0.0263   -5.2479
        # d, c        0.0357   -4.8074
        # c, </s>     0.0(5)   -4.1699
        # total                -24.5504  ->  negated average = 4.0917
        bigrams = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        expected_entropy = 4.0917
        expected_perplexity = 17.0504

        self.assertAlmostEqual(expected_entropy, self.model.entropy(bigrams), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(bigrams), places=4
        )
@add_metaclass(ParametrizeTestsMeta)
class LidstoneTrigramTests(unittest.TestCase):
    """Unit tests for the Lidstone model (gamma = 0.1) trained on trigrams."""

    score_tests = [
        # Same reasoning as for the bigram model.
        ("d", ["c"], 1.1 / 1.8),
        # A word that never follows the context only gets the gamma mass.
        ("e", ["c"], 0.1 / 1.8),
        # Same two checks with a full trigram context.
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(3)
        self.model = Lidstone(0.1, 3, vocabulary=vocabulary)
        self.model.fit(corpus)
@add_metaclass(ParametrizeTestsMeta)
class LaplaceBigramTests(unittest.TestCase):
    """Unit tests for the Laplace (add-one) model trained on bigrams."""

    score_tests = [
        # Basic sanity check:
        # count(d | c) = 1, smoothed to 2
        # total count after "c" = 1, smoothed to 1 + 8 = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14, vocab size: 8
        # Denominator: 14 + 8 = 22
        # count("a") = 2, smoothed to 3
        ("a", None, 3.0 / 22),
        # In vocabulary but unseen: count("z") = 0, smoothed to 1
        ("z", None, 1.0 / 22),
        # Out of vocabulary uses the "UNK" score:
        # count("<UNK>") = 3, smoothed to 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(2)
        self.model = Laplace(2, vocabulary=vocabulary)
        self.model.fit(corpus)

    def test_gamma(self):
        # Make sure the gamma is set to 1
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        # Unlike MLE this should be able to handle completely novel ngrams
        #
        # bigram      score    log score
        # <s>, a      0.2      -2.3219
        # a, c        0.1      -3.3219
        # c, UNK      0.(1)    -3.1699
        # UNK, d      0.(09)   -3.4594
        # d, c        0.1      -3.3219
        # c, </s>     0.(1)    -3.1699
        # total                -18.7651  ->  negated average = 3.1275
        bigrams = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        expected_entropy = 3.1275
        expected_perplexity = 8.7393

        self.assertAlmostEqual(expected_entropy, self.model.entropy(bigrams), places=4)
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(bigrams), places=4
        )
@add_metaclass(ParametrizeTestsMeta)
class WittenBellInterpolatedTrigramTests(unittest.TestCase):
    """Unit tests for the Witten-Bell interpolated trigram model."""

    score_tests = [
        # For unigram scores by default revert to MLE
        # Total unigrams: 18
        # count('c'): 1
        ("c", None, 1.0 / 18),
        # In vocabulary but unseen: count("z") = 0
        ("z", None, 0.0 / 18),
        # Out of vocabulary should use "UNK" score: count("<UNK>") = 3
        ("y", None, 3.0 / 18),
        # gamma(['b']) = 0.1111
        # mle.score('c', ['b']) = 0.5
        # (1 - gamma) * mle + gamma * mle('c') ~= 0.45 + .3 / 18
        ("c", ["b"], (1 - 0.1111) * 0.5 + 0.1111 * 1 / 18),
        # Building on that, let's try 'a b c' as the trigram
        # gamma(['a', 'b']) = 0.0667
        # mle("c", ["a", "b"]) = 1
        ("c", ["a", "b"], (1 - 0.0667) + 0.0667 * ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
    ]

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(3)
        self.model = WittenBellInterpolated(3, vocabulary=vocabulary)
        self.model.fit(corpus)
@add_metaclass(ParametrizeTestsMeta)
class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
    """Unit tests for the Kneser-Ney interpolated trigram model."""

    score_tests = [
        # For unigram scores revert to uniform
        # Vocab size: 8
        # count('c'): 1
        ("c", None, 1.0 / 8),
        # In vocabulary but unseen, still uses uniform
        ("z", None, 1 / 8),
        # Out of vocabulary should use "UNK" score, i.e. again uniform
        ("y", None, 1.0 / 8),
        # alpha = count('bc') - discount = 1 - 0.1 = 0.9
        # gamma(['b']) = discount * number of unique words that follow ['b'] = 0.1 * 2
        # normalizer = total number of bigrams with this context = 2
        # the final should be: (alpha + gamma * unigram_score("c"))
        ("c", ["b"], (0.9 + 0.2 * (1 / 8)) / 2),
        # Building on that, let's try 'a b c' as the trigram
        # alpha = count('abc') - discount = 1 - 0.1 = 0.9
        # gamma(['a', 'b']) = 0.1 * 1
        # normalizer = total number of trigrams with prefix "ab" = 1 => we can ignore it!
        ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
    ]

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(3)
        self.model = KneserNeyInterpolated(3, vocabulary=vocabulary)
        self.model.fit(corpus)
class NgramModelTextGenerationTests(unittest.TestCase):
    """Text generation tests, run against an MLE trigram model."""

    def setUp(self):
        vocabulary, corpus = _prepare_test_data(3)
        self.model = MLE(3, vocabulary=vocabulary)
        self.model.fit(corpus)

    def test_generate_one_no_context(self):
        generated = self.model.generate(random_seed=3)
        self.assertEqual(generated, "<UNK>")

    def test_generate_one_limiting_context(self):
        # We don't need random_seed for contexts with only one continuation
        for seed in (["c"], ["b", "c"], ["a", "c"]):
            self.assertEqual(self.model.generate(text_seed=seed), "d")

    def test_generate_one_varied_context(self):
        # When context doesn't limit our options enough, seed the random choice
        generated = self.model.generate(text_seed=("a", "<s>"), random_seed=2)
        self.assertEqual(generated, "a")

    def test_generate_cycle(self):
        # Add a cycle to the model: bd -> b, db -> d
        cyclic_sentence = list(padded_everygrams(self.model.order, list("bdbdbd")))
        self.model.fit([cyclic_sentence])

        # Test that we can escape the cycle
        generated = self.model.generate(7, text_seed=("b", "d"), random_seed=5)
        self.assertEqual(generated, ["b", "d", "b", "d", "b", "d", "</s>"])

    def test_generate_with_text_seed(self):
        generated = self.model.generate(5, text_seed=("<s>", "e"), random_seed=3)
        self.assertEqual(generated, ["<UNK>", "a", "d", "b", "<UNK>"])

    def test_generate_oov_text_seed(self):
        # An out-of-vocabulary seed must behave exactly like an "<UNK>" seed.
        from_oov_seed = self.model.generate(text_seed=("aliens",), random_seed=3)
        from_unk_seed = self.model.generate(text_seed=("<UNK>",), random_seed=3)
        self.assertEqual(from_oov_seed, from_unk_seed)

    def test_generate_None_text_seed(self):
        # should crash with type error when we try to look it up in vocabulary
        with self.assertRaises(TypeError):
            self.model.generate(text_seed=(None,))

        # This will work
        with_none_seed = self.model.generate(text_seed=None, random_seed=3)
        without_seed = self.model.generate(random_seed=3)
        self.assertEqual(with_none_seed, without_seed)

View File

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
from nltk.lm.preprocessing import padded_everygram_pipeline
class TestPreprocessing(unittest.TestCase):
    """Tests for the helpers in nltk.lm.preprocessing."""

    def test_padded_everygram_pipeline(self):
        # For order 2 the pipeline should yield, per sentence, all padded
        # unigrams followed by all padded bigrams, plus the flat padded
        # token stream for building the vocabulary.
        padded_tokens = ["<s>", "a", "b", "c", "</s>"]
        unigrams = [("<s>",), ("a",), ("b",), ("c",), ("</s>",)]
        bigrams = [("<s>", "a"), ("a", "b"), ("b", "c"), ("c", "</s>")]
        expected_train = [unigrams + bigrams]

        train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])

        materialized_train = [list(sentence) for sentence in train_data]
        self.assertEqual(materialized_train, expected_train)
        self.assertEqual(list(vocab_data), padded_tokens)

View File

@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
from collections import Counter
import six
from nltk.lm import Vocabulary
class NgramModelVocabularyTests(unittest.TestCase):
    """Tests for the Vocabulary class, using a shared fixture with cutoff 2."""

    @classmethod
    def setUpClass(cls):
        tokens = ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
        cls.vocab = Vocabulary(tokens, unk_cutoff=2)

    def test_truthiness(self):
        self.assertTrue(self.vocab)

    def test_cutoff_value_set_correctly(self):
        self.assertEqual(self.vocab.cutoff, 2)

    def test_unable_to_change_cutoff(self):
        with self.assertRaises(AttributeError):
            self.vocab.cutoff = 3

    def test_cutoff_setter_checks_value(self):
        with self.assertRaises(ValueError) as raised:
            Vocabulary("abc", unk_cutoff=0)
        self.assertEqual(
            "Cutoff value cannot be less than 1. Got: 0", str(raised.exception)
        )

    def test_counts_set_correctly(self):
        counts = self.vocab.counts
        for token, expected in (("a", 2), ("b", 2), ("c", 1)):
            self.assertEqual(counts[token], expected)

    def test_membership_check_respects_cutoff(self):
        # a was seen 2 times, so it should be considered part of the vocabulary
        self.assertIn("a", self.vocab)
        # "c" was seen once, it shouldn't be considered part of the vocab
        self.assertNotIn("c", self.vocab)
        # "z" was seen only once as well, so it is below the cutoff too
        self.assertNotIn("z", self.vocab)

    def test_vocab_len_respects_cutoff(self):
        # Vocab size is the number of unique tokens that occur at least as often
        # as the cutoff value, plus 1 to account for unknown words.
        self.assertEqual(5, len(self.vocab))

    def test_vocab_iter_respects_cutoff(self):
        every_counted_token = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
        tokens_above_cutoff = ["a", "b", "d", "e", "<UNK>"]

        six.assertCountEqual(
            self, every_counted_token, list(self.vocab.counts.keys())
        )
        six.assertCountEqual(self, tokens_above_cutoff, list(self.vocab))

    def test_update_empty_vocab(self):
        vocab = Vocabulary(unk_cutoff=2)
        self.assertEqual(len(vocab), 0)
        self.assertFalse(vocab)
        self.assertIn(vocab.unk_label, vocab)

        vocab.update(list("abcde"))
        self.assertIn(vocab.unk_label, vocab)

    def test_lookup(self):
        lookup = self.vocab.lookup
        self.assertEqual(lookup("a"), "a")
        self.assertEqual(lookup("c"), "<UNK>")

    def test_lookup_iterables(self):
        lookup = self.vocab.lookup
        self.assertEqual(lookup(["a", "b"]), ("a", "b"))
        self.assertEqual(lookup(("a", "b")), ("a", "b"))
        self.assertEqual(lookup(("a", "c")), ("a", "<UNK>"))
        self.assertEqual(
            lookup(map(str, range(3))), ("<UNK>", "<UNK>", "<UNK>")
        )

    def test_lookup_empty_iterables(self):
        lookup = self.vocab.lookup
        self.assertEqual(lookup(()), ())
        self.assertEqual(lookup([]), ())
        self.assertEqual(lookup(iter([])), ())
        self.assertEqual(lookup(n for n in range(0, 0)), ())

    def test_lookup_recursive(self):
        lookup = self.vocab.lookup
        self.assertEqual(
            lookup([["a", "b"], ["a", "c"]]), (("a", "b"), ("a", "<UNK>"))
        )
        self.assertEqual(lookup([["a", "b"], "c"]), (("a", "b"), "<UNK>"))
        self.assertEqual(lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),))

    def test_lookup_None(self):
        with self.assertRaises(TypeError):
            self.vocab.lookup(None)
        with self.assertRaises(TypeError):
            list(self.vocab.lookup([None, None]))

    def test_lookup_int(self):
        with self.assertRaises(TypeError):
            self.vocab.lookup(1)
        with self.assertRaises(TypeError):
            list(self.vocab.lookup([1, 2]))

    def test_lookup_empty_str(self):
        self.assertEqual(self.vocab.lookup(""), "<UNK>")

    def test_eqality(self):
        reference = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        identical = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        other_unk_label = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
        fewer_tokens = Vocabulary(["a", "b"], unk_cutoff=1)

        self.assertEqual(reference, identical)
        self.assertNotEqual(reference, other_unk_label)
        self.assertNotEqual(reference, fewer_tokens)

    def test_str(self):
        self.assertEqual(
            str(self.vocab),
            "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>",
        )

    def test_creation_with_counter(self):
        # A pre-computed Counter must give the same vocabulary as raw tokens.
        token_counts = Counter(
            ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
        )
        self.assertEqual(self.vocab, Vocabulary(token_counts, unk_cutoff=2))

View File

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""
Unit tests for nltk.compat.
See also nltk/test/compat.doctest.
"""
from __future__ import absolute_import, unicode_literals
import unittest
from nltk.text import Text
from nltk.compat import PY3, python_2_unicode_compatible
def setup_module(module):
    """Skip this whole module when running under Python 3."""
    from nose import SkipTest

    skip_reason = "test_2x_compat is for testing nltk.compat under Python 2.x"
    if PY3:
        raise SkipTest(skip_reason)
class TestTextTransliteration(unittest.TestCase):
    """Checks the byte-string forms of a Text holding non-ASCII tokens."""

    txt = Text(["São", "Tomé", "and", "Príncipe"])

    def test_repr(self):
        # repr() is expected to escape the accented characters.
        actual = repr(self.txt)
        self.assertEqual(actual, br"<Text: S\xe3o Tom\xe9 and Pr\xedncipe...>")

    def test_str(self):
        # str() is expected to transliterate them to plain ASCII.
        actual = str(self.txt)
        self.assertEqual(actual, b"<Text: Sao Tome and Principe...>")
class TestFraction(unittest.TestCase):
    """Tests for the ``_normalize`` keyword of nltk.compat.Fraction."""

    def test_unnoramlize_fraction(self):
        # unittest assertion methods are used instead of bare ``assert`` so
        # the checks still run under ``python -O`` and report both operands
        # on failure.
        from fractions import Fraction as NativePythonFraction
        from nltk.compat import Fraction as NLTKFraction

        # The native fraction should throw a TypeError in Python < 3.5
        with self.assertRaises(TypeError):
            NativePythonFraction(0, 1000, _normalize=False)

        # Using nltk.compat.Fraction in Python < 3.5
        compat_frac = NLTKFraction(0, 1000, _normalize=False)
        # The numerator and denominator do not change.
        self.assertEqual(compat_frac.numerator, 0)
        self.assertEqual(compat_frac.denominator, 1000)
        # The floating point value remains normalized.
        self.assertEqual(float(compat_frac), 0.0)

        # Checks that numerator and denominator are not reduced by their
        # greatest common divisor (gcd).
        six_twelve = NLTKFraction(6, 12, _normalize=False)
        self.assertEqual(six_twelve.numerator, 6)
        self.assertEqual(six_twelve.denominator, 12)

        one_two = NLTKFraction(1, 2, _normalize=False)
        self.assertEqual(one_two.numerator, 1)
        self.assertEqual(one_two.denominator, 2)

        # Checks against the native fraction.
        six_twelve_original = NativePythonFraction(6, 12)
        # Checks that the rational values of one_two and six_twelve are the same.
        self.assertEqual(float(one_two), float(six_twelve))
        self.assertEqual(float(six_twelve), float(six_twelve_original))

        # Checks that the fraction does get normalized, even when
        # _normalize == False when numerator is using native
        # fractions.Fraction.from_float
        self.assertEqual(
            NLTKFraction(3.142, _normalize=False), NativePythonFraction(3.142)
        )

View File

@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""
Unit tests for nltk.metrics.aline
"""
from __future__ import unicode_literals
import unittest
from nltk.metrics import aline
class TestAline(unittest.TestCase):
    """
    Test Aline algorithm for aligning phonetic sequences
    """

    def test_aline(self):
        # Each case: (first word, second word, expected list of alignments).
        # A '-' marks a segment that has no counterpart in the other word.
        cases = [
            (
                'θin',
                'tenwis',
                [[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]],
            ),
            (
                'jo',
                'ʒə',
                [[('j', 'ʒ'), ('o', 'ə')]],
            ),
            (
                'pematesiweni',
                'pematesewen',
                [
                    [
                        ('p', 'p'),
                        ('e', 'e'),
                        ('m', 'm'),
                        ('a', 'a'),
                        ('t', 't'),
                        ('e', 'e'),
                        ('s', 's'),
                        ('i', 'e'),
                        ('w', 'w'),
                        ('e', 'e'),
                        ('n', 'n'),
                        ('i', '-'),
                    ]
                ],
            ),
            (
                'tuwθ',
                'dentis',
                [
                    [
                        ('t', 'd'),
                        ('u', 'e'),
                        ('w', '-'),
                        ('-', 'n'),
                        ('-', 't'),
                        ('-', 'i'),
                        ('θ', 's'),
                    ]
                ],
            ),
        ]
        for first, second, expected in cases:
            self.assertEqual(aline.align(first, second), expected)

    def test_aline_delta(self):
        """
        Test aline for computing the difference between two segments
        """
        for first, second, expected in (('p', 'q', 20.0), ('a', 'A', 0.0)):
            self.assertEqual(aline.delta(first, second), expected)

View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
"""
Tests for Brill tagger.
"""
import unittest
from nltk.tag import UnigramTagger, brill, brill_trainer
from nltk.tbl import Template
from nltk.corpus import treebank
from nltk.tbl import demo
class TestBrill(unittest.TestCase):
    """Tests for the Brill tagger trainer."""

    def test_pos_template(self):
        training_sents = treebank.tagged_sents()[:1000]
        baseline_tagger = UnigramTagger(training_sents)
        # A single template that looks at the tag of the previous token.
        templates = [brill.Template(brill.Pos([-1]))]
        trainer = brill_trainer.BrillTaggerTrainer(baseline_tagger, templates)
        brill_tagger = trainer.train(training_sents)

        # Example from https://github.com/nltk/nltk/issues/769
        words = 'This is a foo bar sentence'.split()
        expected_tags = ['DT', 'VBZ', 'DT', None, 'NN', None]
        self.assertEqual(brill_tagger.tag(words), list(zip(words, expected_tags)))

    @unittest.skip("Should be tested in __main__ of nltk.tbl.demo")
    def test_brill_demo(self):
        demo()

View File

@@ -0,0 +1,39 @@
import unittest
from nltk import ConditionalFreqDist, tokenize
class TestEmptyCondFreq(unittest.TestCase):
    """Checks that read-style calls do not add conditions to a ConditionalFreqDist."""

    def test_tabulate(self):
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        try:
            empty.tabulate(conditions="BUG")  # nonexistent keys shouldn't be added
        except Exception:
            # Best effort: only the state of ``empty`` afterwards matters here.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate.
            pass
        self.assertEqual(empty.conditions(), [])

    def test_plot(self):
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        try:
            empty.plot(conditions=["BUG"])  # nonexistent keys shouldn't be added
        except Exception:
            # Best effort, as above: a failure to plot is not what this test
            # is about, so it is deliberately ignored.
            pass
        self.assertEqual(empty.conditions(), [])

    def test_increment(self):
        # make sure that we can still mutate cfd normally
        text = "cow cat mouse cat tiger"
        cfd = ConditionalFreqDist()

        # create cfd with word length as condition
        for word in tokenize.word_tokenize(text):
            condition = len(word)
            cfd[condition][word] += 1

        # Sorted so the check does not depend on the order of conditions().
        self.assertEqual(sorted(cfd.conditions()), [3, 5])

        # incrementing previously unseen key is still possible
        cfd[2]['hi'] += 1
        self.assertEqual(set(cfd.conditions()), {3, 5, 2})  # new condition added
        # key's frequency incremented from 0 (unseen) to 1
        self.assertEqual(cfd[2]['hi'], 1)

View File

@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
import unittest
import nltk
from nltk.grammar import CFG
class ChomskyNormalFormForCFGTest(unittest.TestCase):
    """Tests for converting a CFG to (flexible) Chomsky normal form."""

    def _assert_cnf_status(self, grammar, flexible, strict):
        # Check the flexible form first, then the strict one.
        self.assertEqual(grammar.is_flexible_chomsky_normal_form(), flexible)
        self.assertEqual(grammar.is_chomsky_normal_form(), strict)

    def test_simple(self):
        first = CFG.fromstring(
            """
S -> NP VP
PP -> P NP
NP -> Det N | NP PP P
VP -> V NP | VP PP
VP -> Det
Det -> 'a' | 'the'
N -> 'dog' | 'cat'
V -> 'chased' | 'sat'
P -> 'on' | 'in'
"""
        )
        self._assert_cnf_status(first, flexible=False, strict=False)
        # The flexible conversion yields flexible CNF only.
        first = first.chomsky_normal_form(flexible=True)
        self._assert_cnf_status(first, flexible=True, strict=False)

        second = CFG.fromstring(
            """
S -> NP VP
NP -> VP N P
VP -> P
N -> 'dog' | 'cat'
P -> 'on' | 'in'
"""
        )
        self._assert_cnf_status(second, flexible=False, strict=False)
        # The default conversion yields strict CNF.
        second = second.chomsky_normal_form()
        self._assert_cnf_status(second, flexible=True, strict=True)

    def test_complex(self):
        atis = nltk.data.load('grammars/large_grammars/atis.cfg')
        self._assert_cnf_status(atis, flexible=False, strict=False)
        atis = atis.chomsky_normal_form(flexible=True)
        self._assert_cnf_status(atis, flexible=True, strict=False)

View File

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
from nltk import RegexpParser
class TestChunkRule(unittest.TestCase):
    # Regression tests for tag patterns used in RegexpParser chunk rules.

    def test_tag_pattern2re_pattern_quantifier(self):
        """Test for bug https://github.com/nltk/nltk/issues/1597

        Ensures that curly bracket quantifiers can be used inside a chunk rule.
        This type of quantifier has been used for the supplementary example
        in http://www.nltk.org/book/ch07.html#exploring-text-corpora.
        """
        # One POS-tagged sentence, as (token, tag) pairs.
        sent = [
            ('The', 'AT'),
            ('September-October', 'NP'),
            ('term', 'NN'),
            ('jury', 'NN'),
            ('had', 'HVD'),
            ('been', 'BEN'),
            ('charged', 'VBN'),
            ('by', 'IN'),
            ('Fulton', 'NP-TL'),
            ('Superior', 'JJ-TL'),
            ('Court', 'NN-TL'),
            ('Judge', 'NN-TL'),
            ('Durwood', 'NP'),
            ('Pye', 'NP'),
            ('to', 'TO'),
            ('investigate', 'VB'),
            ('reports', 'NNS'),
            ('of', 'IN'),
            ('possible', 'JJ'),
            ('``', '``'),
            ('irregularities', 'NNS'),
            ("''", "''"),
            ('in', 'IN'),
            ('the', 'AT'),
            ('hard-fought', 'JJ'),
            ('primary', 'NN'),
            ('which', 'WDT'),
            ('was', 'BEDZ'),
            ('won', 'VBN'),
            ('by', 'IN'),
            ('Mayor-nominate', 'NN-TL'),
            ('Ivan', 'NP'),
            ('Allen', 'NP'),
            ('Jr.', 'NP'),
            ('.', '.'),
        ]  # source: brown corpus
        # The rule chunks runs of four or more consecutive tags matching "N.*";
        # "{4,}" is the curly-bracket quantifier under test.
        cp = RegexpParser('CHUNK: {<N.*>{4,}}')
        tree = cp.parse(sent)
        # Only the two runs of four N-tags are expected to be wrapped in CHUNK.
        # NOTE(review): the continuation lines of the expected string carry no
        # leading whitespace here -- confirm against the real pformat() output,
        # which presumably indents child nodes.
        assert (
            tree.pformat()
            == """(S
The/AT
September-October/NP
term/NN
jury/NN
had/HVD
been/BEN
charged/VBN
by/IN
Fulton/NP-TL
Superior/JJ-TL
(CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
to/TO
investigate/VB
reports/NNS
of/IN
possible/JJ
``/``
irregularities/NNS
''/''
in/IN
the/AT
hard-fought/JJ
primary/NN
which/WDT
was/BEDZ
won/VBN
by/IN
(CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
./.)"""
        )

View File

@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
"""
Unit tests for nltk.classify. See also: nltk/test/classify.doctest
"""
from __future__ import absolute_import
from nose import SkipTest
from nltk import classify
# Labelled training featuresets: (features, label) pairs over three binary
# features. Some featuresets appear more than once, with differing labels.
TRAIN = [
    ({'a': 1, 'b': 1, 'c': 1}, 'y'),
    ({'a': 1, 'b': 1, 'c': 1}, 'x'),
    ({'a': 1, 'b': 1, 'c': 0}, 'y'),
    ({'a': 0, 'b': 1, 'c': 1}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
    ({'a': 0, 'b': 0, 'c': 1}, 'y'),
    ({'a': 0, 'b': 1, 'c': 0}, 'x'),
    ({'a': 0, 'b': 0, 'c': 0}, 'x'),
    ({'a': 0, 'b': 1, 'c': 1}, 'y'),
]

# Unlabelled featuresets to classify.
TEST = [
    {'a': 1, 'b': 0, 'c': 1},  # unseen
    {'a': 1, 'b': 0, 'c': 0},  # unseen
    {'a': 0, 'b': 1, 'c': 1},  # seen 3 times, labels=y,y,x
    {'a': 0, 'b': 1, 'c': 0},  # seen 1 time, label=x
]

# Expected (P(x), P(y)) for each entry of TEST, in the same order.
RESULTS = [(0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24)]
def assert_classifier_correct(algorithm):
    """Train a MaxentClassifier with ``algorithm`` and check it against RESULTS.

    The test is skipped (``SkipTest``) when training raises ``LookupError``
    or ``AttributeError``; each predicted probability must be within 1e-2
    of the expected one.
    """
    try:
        classifier = classify.MaxentClassifier.train(
            TRAIN, algorithm, trace=0, max_iter=1000
        )
    except (LookupError, AttributeError) as error:
        raise SkipTest(str(error))

    for featureset, expected_probs in zip(TEST, RESULTS):
        pdist = classifier.prob_classify(featureset)
        # expected_probs is ordered (P(x), P(y)).
        for label, expected in zip(('x', 'y'), expected_probs):
            assert abs(pdist.prob(label) - expected) < 1e-2, (
                pdist.prob(label),
                expected,
            )
def test_megam():
    """Check the classifier trained with the 'MEGAM' algorithm."""
    assert_classifier_correct(algorithm='MEGAM')
def test_tadm():
    """Check the classifier trained with the 'TADM' algorithm."""
    assert_classifier_correct(algorithm='TADM')

View File

@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
## Test bigram counters with discontinuous bigrams and repeated words
# Largest absolute difference at which two association scores count as equal.
_EPSILON = 1e-8


def close_enough(x, y):
    """Verify that two sequences of n-gram association values are within
    _EPSILON of each other.

    Each sequence holds ``(ngram, score)`` entries. They match when they have
    the same length and, position by position, the ngrams are equal and the
    scores differ by at most ``_EPSILON``.

    :param x: first iterable of ``(ngram, score)`` entries
    :param y: second iterable of ``(ngram, score)`` entries
    :return: ``True`` if the sequences match, ``False`` otherwise
    """
    # Materialize so that lengths can be compared even for generators.
    x, y = list(x), list(y)
    # zip() stops at the shorter input, so without this check a missing or
    # extra entry would go unnoticed.
    if len(x) != len(y):
        return False
    for entry_x, entry_y in zip(x, y):
        if entry_x[0] != entry_y[0] or abs(entry_x[1] - entry_y[1]) > _EPSILON:
            return False
    return True
class TestBigram(unittest.TestCase):
    """Bigram collocation counts and PMI scores for several window sizes."""

    def test_bigram2(self):
        words = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(words)

        expected_bigram_counts = [
            (('a', 'a'), 1),
            (('a', 'test'), 1),
            (('is', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'is'), 1),
            (('this', 'this'), 1),
        ]
        expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'a'), 1.0),
            (('a', 'test'), 1.0),
            (('is', 'a'), 1.0),
            (('is', 'is'), 1.0),
            (('test', 'test'), 1.0),
            (('this', 'is'), 1.0),
            (('this', 'this'), 1.0),
        ]

        # python 2.6 does not have assertItemsEqual or assertListEqual
        self.assertEqual(
            sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts)
        )
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))
        # With adjacent pairs only there is one bigram fewer than words.
        self.assertTrue(
            len(words)
            == sum(finder.word_fd.values())
            == sum(finder.ngram_fd.values()) + 1
        )
        actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi)))

    def test_bigram3(self):
        words = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(words, window_size=3)

        expected_bigram_counts = [
            (('a', 'test'), 3),
            (('is', 'a'), 3),
            (('this', 'is'), 3),
            (('a', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'this'), 1),
        ]
        expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'test'), 1.584962500721156),
            (('is', 'a'), 1.584962500721156),
            (('this', 'is'), 1.584962500721156),
            (('a', 'a'), 0.0),
            (('is', 'is'), 0.0),
            (('test', 'test'), 0.0),
            (('this', 'this'), 0.0),
        ]

        self.assertEqual(
            sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts)
        )
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))
        self.assertTrue(
            len(words)
            == sum(finder.word_fd.values())
            == (sum(finder.ngram_fd.values()) + 2 + 1) / 2.0
        )
        actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi)))

    def test_bigram5(self):
        words = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(words, window_size=5)

        expected_bigram_counts = [
            (('a', 'test'), 4),
            (('is', 'a'), 4),
            (('this', 'is'), 4),
            (('is', 'test'), 3),
            (('this', 'a'), 3),
            (('a', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'this'), 1),
        ]
        expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'test'), 1.0),
            (('is', 'a'), 1.0),
            (('this', 'is'), 1.0),
            (('is', 'test'), 0.5849625007211562),
            (('this', 'a'), 0.5849625007211562),
            (('a', 'a'), -1.0),
            (('is', 'is'), -1.0),
            (('test', 'test'), -1.0),
            (('this', 'this'), -1.0),
        ]

        self.assertEqual(
            sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts)
        )
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))
        self.assertTrue(
            len(words)
            == sum(finder.word_fd.values())
            == (sum(finder.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0
        )
        actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(actual_pmi, sorted(expected_pmi)))

View File

@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
import contextlib
import sys
from nose import with_setup
from nltk.corpus import gutenberg
from nltk.text import Text
try:
from StringIO import StringIO
except ImportError as e:
from io import StringIO
@contextlib.contextmanager
def stdout_redirect(where):
    """Temporarily send ``sys.stdout`` to ``where``.

    :param where: file-like object that receives everything written to
        ``sys.stdout`` inside the ``with`` block.
    :return: context manager that yields ``where``.
    """
    # Remember the stream that is active right now.  Restoring
    # ``sys.__stdout__`` instead would discard any replacement installed by
    # the test runner (output capture) or by an enclosing redirect.
    previous = sys.stdout
    sys.stdout = where
    try:
        yield where
    finally:
        sys.stdout = previous
class TestConcordance(unittest.TestCase):
    """Text constructed using: http://www.nltk.org/book/ch01.html"""

    @classmethod
    def setUpClass(cls):
        # ``setUpClass`` is the class-level fixture that unittest itself
        # calls; a method named ``setup_class`` is not part of the unittest
        # protocol, which would leave ``cls.corpus`` undefined.
        cls.corpus = gutenberg.words('melville-moby_dick.txt')

    def setUp(self):
        self.text = Text(TestConcordance.corpus)
        self.query = "monstrous"
        self.maxDiff = None
        # Expected concordance lines for ``self.query``, in corpus order.
        self.list_out = [
            'ong the former , one was of a most monstrous size . ... This came towards us , ',
            'ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r',
            'll over with a heathenish array of monstrous clubs and spears . Some were thick',
            'd as you gazed , and wondered what monstrous cannibal and savage could ever hav',
            'that has survived the flood ; most monstrous and most mountainous ! That Himmal',
            'they might scout at Moby Dick as a monstrous fable , or still worse and more de',
            'th of Radney .\'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l',
            'ing Scenes . In connexion with the monstrous pictures of whales , I am strongly',
            'ere to enter upon those still more monstrous stories of them which are to be fo',
            'ght have been rummaged out of this monstrous cabinet there is no telling . But ',
            'of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u',
        ]

    def test_concordance_list(self):
        concordance_out = self.text.concordance_list(self.query)
        self.assertEqual(self.list_out, [c.line for c in concordance_out])

    def test_concordance_width(self):
        # With width=0 only the matched token itself is compared.
        list_out = ["monstrous"] * 6 + ["Monstrous"] + ["monstrous"] * 4
        concordance_out = self.text.concordance_list(self.query, width=0)
        self.assertEqual(list_out, [c.query for c in concordance_out])

    def test_concordance_lines(self):
        concordance_out = self.text.concordance_list(self.query, lines=3)
        self.assertEqual(self.list_out[:3], [c.line for c in concordance_out])

    def test_concordance_print(self):
        # The printed form is a header followed by the same lines checked in
        # test_concordance_list; spaces are ignored in the comparison below.
        print_out = (
            "Displaying 11 of 11 matches:\n" + "\n".join(self.list_out) + "\n"
        )

        with stdout_redirect(StringIO()) as stdout:
            self.text.concordance(self.query)

        def strip_space(raw_str):
            return raw_str.replace(" ", "")

        self.assertEqual(strip_space(print_out), strip_space(stdout.getvalue()))

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,272 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
from nltk.corpus import (
sinica_treebank,
conll2007,
indian,
cess_cat,
cess_esp,
floresta,
ptb,
udhr,
) # mwa_ppdb
from nltk.compat import python_2_unicode_compatible
from nltk.tree import Tree
from nltk.test.unit.utils import skipIf
class TestUdhr(unittest.TestCase):
    """Smoke tests over every file of the UDHR corpus."""

    def test_words(self):
        """Every UDHR file yields at least one word."""
        for fileid in udhr.fileids():
            try:
                tokens = list(udhr.words(fileid))
            except AssertionError:
                # Report which file tripped the reader before re-raising.
                print(fileid)
                raise
            self.assertTrue(tokens)

    def test_raw_unicode(self):
        """``udhr.raw`` never returns a byte string."""
        for fileid in udhr.fileids():
            raw_text = udhr.raw(fileid)
            assert not isinstance(raw_text, bytes), fileid
class TestIndian(unittest.TestCase):
    """Checks the first tokens of the Indian languages corpus."""

    def test_words(self):
        expected = ['মহিষের', 'সন্তান', ':']
        self.assertEqual(indian.words()[:3], expected)

    def test_tagged_words(self):
        expected = [('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM')]
        self.assertEqual(indian.tagged_words()[:3], expected)
class TestCess(unittest.TestCase):
    """Checks the opening words of the CESS Catalan and Spanish treebanks."""

    def test_catalan(self):
        expected = (
            "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial"
        ).split()
        self.assertEqual(cess_cat.words()[:15], expected)
        first_sentence = cess_cat.tagged_sents()[0]
        self.assertEqual(first_sentence[34][0], "càrrecs")

    def test_esp(self):
        expected = (
            "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del"
        ).split()
        self.assertEqual(cess_esp.words()[:15], expected)
        self.assertEqual(cess_esp.words()[115], "años")
class TestFloresta(unittest.TestCase):
    """Checks the opening words of the Floresta treebank."""

    def test_words(self):
        expected = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a".split()
        self.assertEqual(floresta.words()[:10], expected)
class TestSinicaTreebank(unittest.TestCase):
    """Checks sentences and a parse tree of the Sinica treebank sample."""

    def test_sents(self):
        # Every expected token is a non-empty word: a corpus reader does not
        # yield empty strings as tokens.
        first_3_sents = sinica_treebank.sents()[:3]
        self.assertEqual(
            first_3_sents,
            [['一'], ['友情'], ['嘉珍', '和', '我', '住在', '同一條', '巷子']],
        )

    def test_parsed_sents(self):
        parsed_sents = sinica_treebank.parsed_sents()[25]
        self.assertEqual(
            parsed_sents,
            Tree(
                'S',
                [
                    Tree('NP', [Tree('Nba', ['嘉珍'])]),
                    Tree('V‧地', [Tree('VA11', ['不停']), Tree('DE', ['的'])]),
                    Tree('VA4', ['哭泣']),
                ],
            ),
        )
class TestCoNLL2007(unittest.TestCase):
    # Reading the CoNLL 2007 Dependency Treebanks

    def test_sents(self):
        first_sentence = conll2007.sents('esp.train')[0]
        expected = ['El', 'aumento', 'del', 'índice', 'de', 'desempleo']
        self.assertEqual(first_sentence[:6], expected)

    def test_parsed_sents(self):
        # The expected dependency tree is assembled bottom-up from named
        # sub-trees so that each level stays readable.
        subject = Tree(
            'aumento',
            [
                'El',
                Tree(
                    'del',
                    [
                        Tree(
                            'índice',
                            [Tree('de', [Tree('desempleo', ['estadounidense'])])],
                        )
                    ],
                ),
            ],
        )
        market = Tree(
            'en',
            [
                Tree(
                    'mercado',
                    ['el', Tree('de', ['divisas']), Tree('de', ['Fráncfort'])],
                )
            ],
        )
        compared_to = Tree(
            'frente_a',
            [
                ',',
                Tree(
                    '0,9349_dólares',
                    ['los', Tree('de', [Tree('mañana', ['esta'])])],
                ),
            ],
        )
        quotation = Tree(
            'cotizaba',
            [
                ',',
                'que',
                Tree('a', [Tree('15.35', ['las', 'GMT'])]),
                'se',
                market,
                Tree('a', ['0,9452_dólares']),
                compared_to,
            ],
        )
        expected = Tree(
            'fortaleció',
            [
                subject,
                'hoy',
                'considerablemente',
                Tree('al', [Tree('euro', [quotation])]),
                '.',
            ],
        )

        dependency_graph = conll2007.parsed_sents('esp.train')[0]
        self.assertEqual(dependency_graph.tree(), expected)
@skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available")
class TestPTB(unittest.TestCase):
    """Checks against a full Penn Treebank installation (skipped if absent)."""

    def test_fileids(self):
        expected = ['BROWN/CF/CF0{0}.MRG'.format(i) for i in (1, 2, 3, 4)]
        self.assertEqual(ptb.fileids()[:4], expected)

    def test_words(self):
        expected = ['A', 'form', 'of', 'asbestos', 'once', 'used', '*']
        self.assertEqual(ptb.words('WSJ/00/WSJ_0003.MRG')[:7], expected)

    def test_tagged_words(self):
        expected = [('A', 'DT'), ('form', 'NN'), ('of', 'IN')]
        self.assertEqual(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3], expected)

    def test_categories(self):
        expected = [
            'adventure',
            'belles_lettres',
            'fiction',
            'humor',
            'lore',
            'mystery',
            'news',
            'romance',
            'science_fiction',
        ]
        self.assertEqual(ptb.categories(), expected)

    def test_news_fileids(self):
        expected = ['WSJ/00/WSJ_000{0}.MRG'.format(i) for i in (1, 2, 3)]
        self.assertEqual(ptb.fileids('news')[:3], expected)

    def test_category_words(self):
        expected = ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back']
        self.assertEqual(ptb.words(categories=['humor', 'fiction'])[:6], expected)
@unittest.skip("Skipping test for mwa_ppdb.")
class TestMWAPPDB(unittest.TestCase):
    """Checks for the mwa_ppdb corpus (currently skipped)."""

    def test_fileids(self):
        # Imported here because the module-level import list leaves mwa_ppdb
        # commented out; without this the test would raise NameError as soon
        # as the skip is lifted.
        from nltk.corpus import mwa_ppdb

        self.assertEqual(
            mwa_ppdb.fileids(), ['ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs']
        )

    def test_entries(self):
        from nltk.corpus import mwa_ppdb

        self.assertEqual(
            mwa_ppdb.entries()[:10],
            [
                ('10/17/01', '17/10/2001'),
                ('102,70', '102.70'),
                ('13,53', '13.53'),
                ('3.2.5.3.2.1', '3.2.5.3.2.1.'),
                ('53,76', '53.76'),
                ('6.9.5', '6.9.5.'),
                ('7.7.6.3', '7.7.6.3.'),
                ('76,20', '76.20'),
                ('79,85', '79.85'),
                ('93,65', '93.65'),
            ],
        )
# unload corpora
from nltk.corpus import teardown_module

View File

@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
"""
Corpus View Regression Tests
"""
from __future__ import absolute_import, unicode_literals
import unittest
import nltk.data
from nltk.corpus.reader.util import (
StreamBackedCorpusView,
read_whitespace_block,
read_line_block,
)
class TestCorpusViews(unittest.TestCase):
    """Compares StreamBackedCorpusView output with the decoded file contents."""

    linetok = nltk.LineTokenizer(blanklines='keep')
    names = [
        'corpora/inaugural/README',  # A very short file (160 chars)
        'corpora/inaugural/1793-Washington.txt',  # A relatively short file (791 chars)
        'corpora/inaugural/1909-Taft.txt',  # A longer file (32k chars)
    ]

    def data(self):
        """Yield ``(path pointer, decoded text)`` for every file in ``names``."""
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data

    def _cases(self, file_data):
        """Pair each block reader with the token list it should produce."""
        return (
            (read_whitespace_block, file_data.split()),
            (read_line_block, self.linetok.tokenize(file_data)),
        )

    def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.
        for f, file_data in self.data():
            for block_reader, expected in self._cases(file_data):
                view = StreamBackedCorpusView(f, block_reader)
                self.assertEqual(list(view), expected)

    def test_correct_length(self):
        # Check that the corpus views report the correct lengths:
        for f, file_data in self.data():
            for block_reader, expected in self._cases(file_data):
                view = StreamBackedCorpusView(f, block_reader)
                self.assertEqual(len(view), len(expected))

View File

@@ -0,0 +1,22 @@
import unittest
import nltk.data
from nose.tools import assert_raises
class TestData(unittest.TestCase):
    """Checks the error raised by ``nltk.data.find`` for unknown resources."""

    def test_find_raises_exception(self):
        with assert_raises(LookupError) as context:
            nltk.data.find('no_such_resource/foo')

        # The exact class is required here, not a subclass of LookupError.
        raised = context.exception
        assert type(raised) is LookupError, 'Unexpected exception raised'

    def test_find_raises_exception_with_full_resource_name(self):
        no_such_thing = 'no_such_thing/bar'
        with assert_raises(LookupError) as context:
            nltk.data.find(no_such_thing)

        message = str(context.exception)
        assert (
            no_such_thing in message
        ), 'Exception message does not include full resource name'

View File

@@ -0,0 +1,142 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import unittest
from nltk.metrics.agreement import AnnotationTask
class TestDisagreement(unittest.TestCase):
    '''
    Class containing unit tests for nltk.metrics.agreement.Disagreement.
    '''

    # (coder, item, label) ratings for the advanced example.  Item '12' is
    # rated by a single coder only.
    _ADVANCED_DATA = [
        ('A', '1', '1'), ('B', '1', '1'), ('D', '1', '1'),
        ('A', '2', '2'), ('B', '2', '2'), ('C', '2', '3'), ('D', '2', '2'),
        ('A', '3', '3'), ('B', '3', '3'), ('C', '3', '3'), ('D', '3', '3'),
        ('A', '4', '3'), ('B', '4', '3'), ('C', '4', '3'), ('D', '4', '3'),
        ('A', '5', '2'), ('B', '5', '2'), ('C', '5', '2'), ('D', '5', '2'),
        ('A', '6', '1'), ('B', '6', '2'), ('C', '6', '3'), ('D', '6', '4'),
        ('A', '7', '4'), ('B', '7', '4'), ('C', '7', '4'), ('D', '7', '4'),
        ('A', '8', '1'), ('B', '8', '1'), ('C', '8', '2'), ('D', '8', '1'),
        ('A', '9', '2'), ('B', '9', '2'), ('C', '9', '2'), ('D', '9', '2'),
        ('B', '10', '5'), ('C', '10', '5'), ('D', '10', '5'),
        ('C', '11', '1'), ('D', '11', '1'),
        ('C', '12', '3'),
    ]

    def test_easy(self):
        '''
        Simple test, based on
        https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
        '''
        data = [
            ('coder1', 'dress1', 'YES'),
            ('coder2', 'dress1', 'NO'),
            ('coder3', 'dress1', 'NO'),
            ('coder1', 'dress2', 'YES'),
            ('coder2', 'dress2', 'NO'),
            ('coder3', 'dress3', 'NO'),
        ]
        annotation_task = AnnotationTask(data)
        self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)

    def test_easy2(self):
        '''
        Same simple test with 1 rating removed.
        Removal of that rating should not matter: K-Alpha ignores items with
        only 1 rating.
        '''
        data = [
            ('coder1', 'dress1', 'YES'),
            ('coder2', 'dress1', 'NO'),
            ('coder3', 'dress1', 'NO'),
            ('coder1', 'dress2', 'YES'),
            ('coder2', 'dress2', 'NO'),
        ]
        annotation_task = AnnotationTask(data)
        self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)

    def test_advanced(self):
        '''
        More advanced test, based on
        http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
        '''
        data = list(self._ADVANCED_DATA)
        annotation_task = AnnotationTask(data)
        self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)

    def test_advanced2(self):
        '''
        Same more advanced example, but with 1 rating removed.
        Again, removal of that 1 rating should not matter.
        '''
        # Drop the only rating of item '12'; the remaining data must give the
        # same alpha as the full data set.
        data = [rating for rating in self._ADVANCED_DATA if rating[1] != '12']
        self.assertEqual(len(data), len(self._ADVANCED_DATA) - 1)
        annotation_task = AnnotationTask(data)
        self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)

View File

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from nltk.tag import hmm
def _wikipedia_example_hmm():
    """Build the rain/umbrella HMM used by the forward/backward tests.

    Example from wikipedia
    (http://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm)

    :return: ``(model, states, symbols, seq)`` where ``seq`` is a list of
        ``(observation, None)`` pairs.
    """
    states = ['rain', 'no rain']
    symbols = ['umbrella', 'no umbrella']

    transitions = [[0.7, 0.3], [0.3, 0.7]]  # transition probabilities
    emissions = [[0.9, 0.1], [0.2, 0.8]]  # emission probabilities
    priors = [0.5, 0.5]  # initial probabilities

    observations = ['umbrella', 'umbrella', 'no umbrella', 'umbrella', 'umbrella']
    # Pair every observation with a missing (None) tag.
    seq = [(observation, None) for observation in observations]

    model = hmm._create_hmm_tagger(states, symbols, transitions, emissions, priors)
    return model, states, symbols, seq
def test_forward_probability():
    """Forward probabilities of the market HMM for two 'up' observations."""
    from numpy.testing import assert_array_almost_equal

    # example from p. 385, Huang et al
    model, states, symbols = hmm._market_hmm_example()
    observations = [('up', None)] * 2
    expected = [[0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357]]

    # The model returns base-2 log values, so exponentiate before comparing.
    forward = 2 ** model._forward_probability(observations)
    assert_array_almost_equal(forward, expected)
def test_forward_probability2():
    """Normalized forward probabilities of the wikipedia umbrella example."""
    from numpy.testing import assert_array_almost_equal

    wikipedia_results = [
        [0.8182, 0.1818],
        [0.8834, 0.1166],
        [0.1907, 0.8093],
        [0.7308, 0.2692],
        [0.8673, 0.1327],
    ]

    model, states, symbols, seq = _wikipedia_example_hmm()
    forward = 2 ** model._forward_probability(seq)
    # examples in wikipedia are normalized
    row_totals = forward.sum(axis=1)
    forward = (forward.T / row_totals).T

    assert_array_almost_equal(wikipedia_results, forward, 4)
def test_backward_probability():
    """Normalized backward probabilities of the wikipedia umbrella example."""
    from numpy.testing import assert_array_almost_equal

    wikipedia_results = [
        # Forward-backward algorithm doesn't need b0_5,
        # so .backward_probability doesn't compute it.
        # [0.6469, 0.3531],
        [0.5923, 0.4077],
        [0.3763, 0.6237],
        [0.6533, 0.3467],
        [0.6273, 0.3727],
        [0.5, 0.5],
    ]

    model, states, symbols, seq = _wikipedia_example_hmm()
    backward = 2 ** model._backward_probability(seq)
    # examples in wikipedia are normalized
    row_totals = backward.sum(axis=1)
    backward = (backward.T / row_totals).T

    assert_array_almost_equal(wikipedia_results, backward, 4)
def setup_module(module):
    """Skip every test in this module when numpy cannot be imported."""
    from nose import SkipTest

    try:
        import numpy  # only probing availability; the name is unused
    except ImportError:
        raise SkipTest("numpy is required for nltk.test.test_hmm")

View File

@@ -0,0 +1,237 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Twitter client
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Lorenzo Rubio <lrnzcig@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Regression tests for `json2csv()` and `json2csv_entities()` in Twitter
package.
"""
import os
import unittest
from six.moves import zip
from nltk.compat import TemporaryDirectory
from nltk.corpus import twitter_samples
from nltk.twitter.common import json2csv, json2csv_entities
def are_files_identical(filename1, filename2, debug=False):
    """
    Compare two files, ignoring carriage returns.

    Lines are stripped of surrounding whitespace and compared as sorted
    sequences, so neither line order nor line endings matter.  Files with a
    different number of lines are never identical.

    :param filename1: path of the generated output file.
    :param filename2: path of the reference file.
    :param debug: if True, print a description of the first difference.
    :return: True if both files hold the same lines, False otherwise.
    """
    with open(filename1, "rb") as fileA:
        linesA = sorted(line.strip() for line in fileA)
    with open(filename2, "rb") as fileB:
        linesB = sorted(line.strip() for line in fileB)

    # zip() below stops at the shorter file, so an empty or truncated file
    # has to be rejected explicitly.
    if len(linesA) != len(linesB):
        if debug:
            print(
                "Error while comparing files. "
                + "Line counts differ: {0} vs {1}".format(len(linesA), len(linesB))
            )
        return False

    for lineA, lineB in zip(linesA, linesB):
        if lineA != lineB:
            if debug:
                print(
                    "Error while comparing files. "
                    + "First difference at line below."
                )
                print("=> Output file line: {0}".format(lineA))
                print("=> Refer. file line: {0}".format(lineB))
            return False
    return True
class TestJSON2CSV(unittest.TestCase):
    """Regression tests comparing generated CSV files with stored references."""

    # Common stem of every generated and reference file name.
    _STEM = 'tweets.20150430-223406'

    def setUp(self):
        # Only the first 100 tweets of the sample file are converted.
        with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile:
            self.infile = [next(infile) for x in range(100)]
        self.msg = "Test and reference files are not the same"
        self.subdir = os.path.join(os.path.dirname(__file__), 'files')

    def _matches_reference(self, name, write_csv, ref_name=None):
        """Write a CSV with ``write_csv(outfn)`` and compare it to a reference.

        :param name: middle part of the generated file name.
        :param write_csv: callable that takes the output path and writes it.
        :param ref_name: middle part of the reference file name; defaults
            to ``name``.
        :return: result of ``are_files_identical`` for the two files.
        """
        ref_fn = os.path.join(
            self.subdir, '{0}.{1}.csv.ref'.format(self._STEM, ref_name or name)
        )
        with TemporaryDirectory() as tempdir:
            outfn = os.path.join(tempdir, '{0}.{1}.csv'.format(self._STEM, name))
            write_csv(outfn)
            # Compare before the temporary directory is removed.
            return are_files_identical(outfn, ref_fn)

    def _check_fields(self, name, fields):
        """Assert that ``json2csv`` output for ``fields`` matches its reference."""

        def write_csv(outfn):
            json2csv(self.infile, outfn, fields, gzip_compress=False)

        self.assertTrue(self._matches_reference(name, write_csv), msg=self.msg)

    def _check_entities(self, name, main_fields, entity_type, entity_fields):
        """Assert that ``json2csv_entities`` output matches its reference."""

        def write_csv(outfn):
            json2csv_entities(
                self.infile,
                outfn,
                main_fields,
                entity_type,
                entity_fields,
                gzip_compress=False,
            )

        self.assertTrue(self._matches_reference(name, write_csv), msg=self.msg)

    def test_textoutput(self):
        self._check_fields('text', ['text'])

    def test_tweet_metadata(self):
        fields = [
            'created_at',
            'favorite_count',
            'id',
            'in_reply_to_status_id',
            'in_reply_to_user_id',
            'retweet_count',
            'retweeted',
            'text',
            'truncated',
            'user.id',
        ]
        self._check_fields('tweet', fields)

    def test_user_metadata(self):
        fields = ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
        self._check_fields('user', fields)

    def test_tweet_hashtag(self):
        self._check_entities('hashtag', ['id', 'text'], 'hashtags', ['text'])

    def test_tweet_usermention(self):
        self._check_entities(
            'usermention', ['id', 'text'], 'user_mentions', ['id', 'screen_name']
        )

    def test_tweet_media(self):
        self._check_entities('media', ['id'], 'media', ['media_url', 'url'])

    def test_tweet_url(self):
        self._check_entities('url', ['id'], 'urls', ['url', 'expanded_url'])

    def test_userurl(self):
        self._check_entities(
            'userurl', ['id', 'screen_name'], 'user.urls', ['url', 'expanded_url']
        )

    def test_tweet_place(self):
        self._check_entities('place', ['id', 'text'], 'place', ['name', 'country'])

    def test_tweet_place_boundingbox(self):
        self._check_entities(
            'placeboundingbox', ['id', 'name'], 'place.bounding_box', ['coordinates']
        )

    def test_retweet_original_tweet(self):
        entity_fields = [
            'created_at',
            'favorite_count',
            'id',
            'in_reply_to_status_id',
            'in_reply_to_user_id',
            'retweet_count',
            'text',
            'truncated',
            'user.id',
        ]
        self._check_entities('retweet', ['id'], 'retweeted_status', entity_fields)

    def test_file_is_wrong(self):
        """
        Sanity check that file comparison is not giving false positives.
        """

        def write_csv(outfn):
            json2csv(self.infile, outfn, ['text'], gzip_compress=False)

        # The text output is deliberately compared with the retweet reference.
        self.assertFalse(
            self._matches_reference('text', write_csv, ref_name='retweet'),
            msg=self.msg,
        )
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import unittest
from nltk.classify.naivebayes import NaiveBayesClassifier
class NaiveBayesClassifierTest(unittest.TestCase):
    """Trains a two-example classifier and checks its label preferences."""

    def test_simple(self):
        training_features = [
            ({'nice': True, 'good': True}, 'positive'),
            ({'bad': True, 'mean': True}, 'negative'),
        ]
        classifier = NaiveBayesClassifier.train(training_features)

        positive_case = classifier.prob_classify({'nice': True})
        self.assertTrue(
            positive_case.prob('positive') > positive_case.prob('negative')
        )
        self.assertEqual(positive_case.max(), 'positive')

        negative_case = classifier.prob_classify({'bad': True})
        self.assertTrue(
            negative_case.prob('positive') < negative_case.prob('negative')
        )
        self.assertEqual(negative_case.max(), 'negative')

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
"""
Unit tests for nltk.corpus.nombank
"""
from __future__ import unicode_literals
import unittest
from nltk.corpus import nombank
# Load the nombank once.
nombank.nouns()
class NombankDemo(unittest.TestCase):
    """Checks the sizes and first instance of the NomBank corpus."""

    def test_numbers(self):
        # No. of instances.
        instance_count = len(nombank.instances())
        self.assertEqual(instance_count, 114574)
        # No. of rolesets
        roleset_count = len(nombank.rolesets())
        self.assertEqual(roleset_count, 5577)
        # No. of nouns.
        noun_count = len(nombank.nouns())
        self.assertEqual(noun_count, 4704)

    def test_instance(self):
        first_instance = nombank.instances()[0]
        self.assertEqual(first_instance.roleset, 'perc-sign.01')

    def test_framefiles_fileids(self):
        self.assertEqual(len(nombank.fileids()), 4705)
        for fileid in nombank.fileids():
            if not fileid.endswith('.xml'):
                all_xml = False
                break
        else:
            all_xml = True
        self.assertTrue(all_xml)

View File

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
"""
Tests for nltk.pos_tag
"""
from __future__ import unicode_literals
import unittest
from nltk import word_tokenize, pos_tag
class TestPosTag(unittest.TestCase):
    """Checks ``pos_tag`` for English, Russian and unsupported languages."""

    def test_pos_tag_eng(self):
        text = "John's big idea isn't all that bad."
        expected_tagged = [
            ('John', 'NNP'),
            ("'s", 'POS'),
            ('big', 'JJ'),
            ('idea', 'NN'),
            ('is', 'VBZ'),
            ("n't", 'RB'),
            ('all', 'PDT'),
            ('that', 'DT'),
            ('bad', 'JJ'),
            ('.', '.'),
        ]
        assert pos_tag(word_tokenize(text)) == expected_tagged

    def test_pos_tag_eng_universal(self):
        text = "John's big idea isn't all that bad."
        expected_tagged = [
            ('John', 'NOUN'),
            ("'s", 'PRT'),
            ('big', 'ADJ'),
            ('idea', 'NOUN'),
            ('is', 'VERB'),
            ("n't", 'ADV'),
            ('all', 'DET'),
            ('that', 'DET'),
            ('bad', 'ADJ'),
            ('.', '.'),
        ]
        assert pos_tag(word_tokenize(text), tagset='universal') == expected_tagged

    def test_pos_tag_rus(self):
        text = u"Илья оторопел и дважды перечитал бумажку."
        expected_tagged = [
            ('Илья', 'S'),
            ('оторопел', 'V'),
            ('и', 'CONJ'),
            ('дважды', 'ADV'),
            ('перечитал', 'V'),
            ('бумажку', 'S'),
            ('.', 'NONLEX'),
        ]
        assert pos_tag(word_tokenize(text), lang='rus') == expected_tagged

    def test_pos_tag_rus_universal(self):
        text = u"Илья оторопел и дважды перечитал бумажку."
        expected_tagged = [
            ('Илья', 'NOUN'),
            ('оторопел', 'VERB'),
            ('и', 'CONJ'),
            ('дважды', 'ADV'),
            ('перечитал', 'VERB'),
            ('бумажку', 'NOUN'),
            ('.', '.'),
        ]
        assert (
            pos_tag(word_tokenize(text), tagset='universal', lang='rus')
            == expected_tagged
        )

    def test_pos_tag_unknown_lang(self):
        text = u"모르겠 습니 다"
        self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang='kor')
        # Test for default kwarg, `lang=None`
        self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None)

    def test_unspecified_lang(self):
        # Tries to force the lang='eng' option.
        text = u"모르겠 습니 다"
        # One entry per whitespace-separated token of ``text``; the tags are
        # what the English tagger assigns to the Korean tokens.
        expected_but_wrong = [('모르겠', 'JJ'), ('습니', 'NNP'), ('다', 'NN')]
        assert pos_tag(word_tokenize(text)) == expected_but_wrong

View File

@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import unittest
from nltk.corpus import rte as rte_corpus
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_features, rte_classifier
# Expected feature listing for the first six pairs of rte1_dev.xml.  Every
# line follows the "%-15s => %s" layout used by the test, i.e. the feature
# name is left-justified in a 15-character column.  Blank lines separate the
# pairs and are filtered out before the comparison.
expected_from_rte_feature_extration = """
alwayson        => True
ne_hyp_extra    => 0
ne_overlap      => 1
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 3
word_overlap    => 3

alwayson        => True
ne_hyp_extra    => 0
ne_overlap      => 1
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 2
word_overlap    => 1

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 1
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 1
word_overlap    => 2

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 0
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 6
word_overlap    => 2

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 0
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 4
word_overlap    => 0

alwayson        => True
ne_hyp_extra    => 1
ne_overlap      => 0
neg_hyp         => 0
neg_txt         => 0
word_hyp_extra  => 3
word_overlap    => 1
"""
class RTEClassifierTest(unittest.TestCase):
    """Tests for RTE feature extraction and classifier training."""

    # Test the feature extraction method.
    def test_rte_feature_extraction(self):
        pairs = rte_corpus.pairs(['rte1_dev.xml'])[:6]
        test_output = []
        for pair in pairs:
            # Extract the features once per pair rather than once per key.
            features = rte_features(pair)
            test_output.extend(
                "%-15s => %s" % (key, features[key]) for key in sorted(features)
            )
        expected_output = expected_from_rte_feature_extration.strip().split('\n')
        # Remove null strings.
        expected_output = list(filter(None, expected_output))
        self.assertEqual(test_output, expected_output)

    # Test the RTEFeatureExtractor object.
    def test_feature_extractor_object(self):
        rtepair = rte_corpus.pairs(['rte3_dev.xml'])[33]
        extractor = RTEFeatureExtractor(rtepair)
        self.assertEqual(extractor.hyp_words, {'member', 'China', 'SCO.'})
        self.assertEqual(extractor.overlap('word'), set())
        self.assertEqual(extractor.overlap('ne'), {'China'})
        self.assertEqual(extractor.hyp_extra('word'), {'member'})

    # Test the RTE classifier training.
    def test_rte_classification_without_megam(self):
        # Training only has to complete; the classifiers are not inspected.
        clf = rte_classifier('IIS')
        clf = rte_classifier('GIS')

    @unittest.skip("Skipping tests with dependencies on MEGAM")
    def test_rte_classification_with_megam(self):
        # ``nltk`` is not imported at module level, so import it here to keep
        # this test runnable once the skip is lifted.
        import nltk

        nltk.config_megam('/usr/local/bin/megam')
        clf = rte_classifier('megam')
        clf = rte_classifier('BFGS')

View File

@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
"""
The following test performs a random series of reads, seeks, and
tells, and checks that the results are consistent.
"""
from __future__ import absolute_import, unicode_literals
import random
import functools
from io import BytesIO
from nltk.corpus.reader import SeekableUnicodeStreamReader
def check_reader(unicode_string, encoding, n=1000):
    """Run ``n`` random tell/seek/read operations and check their consistency.

    :param unicode_string: text that is encoded and wrapped in the reader.
    :param encoding: encoding used for both encoding and decoding.
    :param n: number of random operations to perform; values of zero or
        less perform none.
    :return: the string 'passed' once all operations have been checked.
    :raises AssertionError: if a read returns text that does not start at
        the position reported by ``tell()``.
    """
    bytestr = unicode_string.encode(encoding)
    strlen = len(unicode_string)
    stream = BytesIO(bytestr)
    reader = SeekableUnicodeStreamReader(stream, encoding)
    # Find all character positions
    chars = []
    while True:
        pos = reader.tell()
        chars.append((pos, reader.read(1)))
        if chars[-1][1] == '':
            break
    # Find all strings
    strings = dict((pos, '') for (pos, c) in chars)
    for pos1, char in chars:
        for pos2, _ in chars:
            if pos2 <= pos1:
                strings[pos2] += char
    # Every position a seek may jump to; built once instead of per seek.
    positions = [p for (p, c) in chars]

    # A bounded loop terminates for any n, including n <= 0.
    for _step in range(n):
        op = random.choice('tsrr')
        # Check our position?
        if op == 't':  # tell
            reader.tell()
        # Perform a seek?
        if op == 's':  # seek
            new_pos = random.choice(positions)
            reader.seek(new_pos)
        # Perform a read?
        if op == 'r':  # read
            if random.random() < 0.3:
                pos = reader.tell()
            else:
                pos = None
            if random.random() < 0.2:
                size = None
            elif random.random() < 0.8:
                size = random.randint(0, int(strlen / 6))
            else:
                size = random.randint(0, strlen + 20)
            if random.random() < 0.8:
                s = reader.read(size)
            else:
                s = reader.readline(size)
            # check that everything's consistent
            if pos is not None:
                assert pos in strings
                assert strings[pos].startswith(s)
    return 'passed'
# Call the randomized test function `check_reader` with a variety of
# input strings and encodings.
ENCODINGS = ['ascii', 'latin1', 'greek', 'hebrew', 'utf-16', 'utf-8']
STRINGS = [
"""
This is a test file.
It is fairly short.
""",
"This file can be encoded with latin1. \x83",
"""\
This is a test file.
Here's a blank line:
And here's some unicode: \xee \u0123 \uffe3
""",
"""\
This is a test file.
Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
""",
]
def test_reader():
    """Yield one ``check_reader`` case per encodable (string, encoding) pair."""
    for string in STRINGS:
        for encoding in ENCODINGS:
            try:
                # skip strings that can't be encoded with the current encoding
                string.encode(encoding)
            except UnicodeEncodeError:
                continue
            # Yield outside the try block so that only the encode probe is
            # guarded by the except clause.
            yield check_reader, string, encoding
# nose shows the whole string arguments in a verbose mode; this is annoying,
# so large string test is separated.
LARGE_STRING = (
"""\
This is a larger file. It has some lines that are longer \
than 72 characters. It's got lots of repetition. Here's \
some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345
How fun! Let's repeat it twenty times.
"""
* 10
)
def test_reader_on_large_string():
    """Yield one ``check_reader`` case on LARGE_STRING per usable encoding."""

    # The large string is bound inside the helper so that only the encoding
    # is passed as a yielded test argument.
    def _check(encoding, n=1000):
        check_reader(LARGE_STRING, encoding, n)

    for encoding in ENCODINGS:
        try:
            # skip strings that can't be encoded with the current encoding
            LARGE_STRING.encode(encoding)
        except UnicodeEncodeError:
            continue
        yield _check, encoding
def test_reader_stream_is_closed():
    """Check that calling the reader's ``__del__`` closes its stream."""
    empty_stream = BytesIO(b'')
    reader = SeekableUnicodeStreamReader(empty_stream, 'ascii')
    # Open right after construction ...
    assert reader.stream.closed is False
    # ... and closed once the finalizer has been invoked explicitly.
    reader.__del__()
    assert reader.stream.closed is True
def teardown_module(module=None):
    """Run a garbage-collection pass; ``module`` is accepted but not used."""
    from gc import collect

    collect()

View File

@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
"""
Unit tests for Senna
"""
from __future__ import unicode_literals
from os import environ, path, sep
import logging
import unittest
from nltk.classify import Senna
from nltk.tag import SennaTagger, SennaChunkTagger, SennaNERTagger
# Resolve the Senna location used by the tests: the SENNA environment
# variable (normalised, with a trailing path separator) when it is set,
# otherwise a fixed default directory.
SENNA_EXECUTABLE_PATH = (
    path.normpath(environ['SENNA']) + sep
    if 'SENNA' in environ
    else '/usr/share/senna-v3.0'
)
# True when the resolved location exists on this machine.
senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaPipeline(unittest.TestCase):
    """Unittest for nltk.classify.senna"""

    def test_senna_pipeline(self):
        """Senna pipeline interface"""
        pipeline = Senna(SENNA_EXECUTABLE_PATH, ['pos', 'chk', 'ner'])
        sent = 'Dusseldorf is an international business center'.split()

        # Flatten every tagged token into a (word, chk, ner, pos) tuple.
        result = []
        for token in pipeline.tag(sent):
            result.append((token['word'], token['chk'], token['ner'], token['pos']))

        # One expected label per input word, column by column.
        chunk_tags = ['B-NP', 'B-VP', 'B-NP', 'I-NP', 'I-NP', 'I-NP']
        ner_tags = ['B-LOC', 'O', 'O', 'O', 'O', 'O']
        pos_tags = ['NNP', 'VBZ', 'DT', 'JJ', 'NN', 'NN']
        expected = list(zip(sent, chunk_tags, ner_tags, pos_tags))

        self.assertEqual(result, expected)
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaTagger(unittest.TestCase):
    """Unittest for nltk.tag.senna"""

    def test_senna_tagger(self):
        """POS-tag a sentence and compare with the expected (word, tag) pairs."""
        words = 'What is the airspeed of an unladen swallow ?'.split()
        tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
        result = tagger.tag(words)
        pos_tags = ['WP', 'VBZ', 'DT', 'NN', 'IN', 'DT', 'NN', 'NN', '.']
        expected = list(zip(words, pos_tags))
        self.assertEqual(result, expected)

    def test_senna_chunk_tagger(self):
        """Chunk-tag a sentence, then extract its NP chunks from the BIO tags."""
        words = 'What is the airspeed of an unladen swallow ?'.split()
        chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH)

        # Both results are computed before anything is asserted.
        result_1 = chktagger.tag(words)
        result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type='NP'))

        bio_tags = ['B-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'O']
        expected_1 = list(zip(words, bio_tags))
        # Each chunk is paired with the dash-joined indices of its words.
        expected_2 = [
            ('What', '0'),
            ('the airspeed', '2-3'),
            ('an unladen swallow', '5-6-7'),
        ]

        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)

    def test_senna_ner_tagger(self):
        """NER-tag two sentences and compare with the expected (word, tag) pairs."""
        nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH)

        words_1 = 'Shakespeare theatre was in London .'.split()
        words_2 = 'UN headquarters are in NY , USA .'.split()

        # Both sentences are tagged before anything is asserted.
        result_1 = nertagger.tag(words_1)
        result_2 = nertagger.tag(words_2)

        expected_1 = list(zip(words_1, ['B-PER', 'O', 'O', 'O', 'B-LOC', 'O']))
        expected_2 = list(
            zip(words_2, ['B-ORG', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O'])
        )

        self.assertEqual(result_1, expected_1)
        self.assertEqual(result_2, expected_2)

View File

@@ -0,0 +1,146 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import os
import unittest
from contextlib import closing
from nltk import data
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
class SnowballTest(unittest.TestCase):
    """Spot checks of SnowballStemmer output for several languages."""

    def test_arabic(self):
        """
        Unit test for the Snowball Arabic light stemmer, which deals with
        prefixes and suffixes.
        """
        # Checks shared by the ignore_stopwords=True and the default scenario.
        shared = [
            ('الْعَرَبِــــــيَّة', "عرب"),
            ("العربية", "عرب"),
            ("فقالوا", "قال"),
            ("الطالبات", "طالب"),
        ]
        scenarios = [
            # Test where the ignore_stopwords=True.
            (
                ("arabic", True),
                shared
                + [
                    ("فالطالبات", "طالب"),
                    ("والطالبات", "طالب"),
                    ("الطالبون", "طالب"),
                    ("اللذان", "اللذان"),
                    ("من", "من"),
                ],
            ),
            # Test where the ignore_stopwords=False.
            (
                ("arabic", False),
                [
                    ("اللذان", "اللذ"),  # this is a stop word
                    ("الطالبات", "طالب"),
                    ("الكلمات", "كلم"),
                ],
            ),
            # Stemmer created without an explicit ignore_stopwords value.
            (("arabic",), shared + [("الكلمات", "كلم")]),
        ]
        for stemmer_args, cases in scenarios:
            ar_stemmer = SnowballStemmer(*stemmer_args)
            for word, stem in cases:
                assert ar_stemmer.stem(word) == stem

    def test_russian(self):
        """Stem a single Russian word."""
        stemmer_russian = SnowballStemmer("russian")
        stemmed = stemmer_russian.stem("авантненькая")
        assert stemmed == "авантненьк"

    def test_german(self):
        """Compare German stemming with and without ignore_stopwords."""
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        checks = [
            (stemmer_german, "Schr\xe4nke", 'schrank'),
            (stemmer_german2, "Schr\xe4nke", 'schrank'),
            (stemmer_german, "keinen", 'kein'),
            # with ignore_stopwords=True this word is expected back unchanged
            (stemmer_german2, "keinen", 'keinen'),
        ]
        for stemmer, word, stem in checks:
            assert stemmer.stem(word) == stem

    def test_spanish(self):
        """Stem two Spanish words, one of which used to raise an IndexError."""
        stemmer = SnowballStemmer('spanish')
        cases = [
            ("Visionado", 'vision'),
            # The word 'algue' was raising an IndexError
            ("algue", 'algu'),
        ]
        for word, stem in cases:
            assert stemmer.stem(word) == stem

    def test_short_strings_bug(self):
        """Stem a very short English token."""
        stemmer = SnowballStemmer('english')
        stemmed = stemmer.stem("y's")
        assert stemmed == 'y'
class PorterTest(unittest.TestCase):
    """Compare PorterStemmer output, per mode, with stem lists stored as NLTK data."""

    def _read_lines(self, resource):
        """Return the lines of the UTF-8 encoded data file ``resource``.

        The file is located with ``data.find`` and is closed before the
        lines are returned, so no handle outlives the call.
        """
        with closing(data.find(resource).open(encoding='utf-8')) as fp:
            return fp.read().splitlines()

    def _vocabulary(self):
        """Return the test vocabulary, one entry per line of the data file."""
        return self._read_lines('stemmers/porter_test/porter_vocabulary.txt')

    def _test_against_expected_output(self, stemmer_mode, expected_stems):
        """Stem each vocabulary word in ``stemmer_mode`` and compare it with
        the entry at the same position in ``expected_stems``.

        Words and stems are paired with ``zip``, so comparison stops at the
        end of the shorter sequence.
        """
        stemmer = PorterStemmer(mode=stemmer_mode)
        for word, true_stem in zip(self._vocabulary(), expected_stems):
            our_stem = stemmer.stem(word)
            assert our_stem == true_stem, (
                "%s should stem to %s in %s mode but got %s"
                % (word, true_stem, stemmer_mode, our_stem)
            )

    def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter

        The sample vocabulary and output were sourced from:
            http://tartarus.org/martin/PorterStemmer/voc.txt
            http://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at
            http://tartarus.org/martin/PorterStemmer/
        """
        self._test_against_expected_output(
            PorterStemmer.MARTIN_EXTENSIONS,
            self._read_lines('stemmers/porter_test/porter_martin_output.txt'),
        )

    def test_vocabulary_nltk_mode(self):
        """Check the NLTK_EXTENSIONS mode against its stored expected stems."""
        self._test_against_expected_output(
            PorterStemmer.NLTK_EXTENSIONS,
            self._read_lines('stemmers/porter_test/porter_nltk_output.txt'),
        )

    def test_vocabulary_original_mode(self):
        """Check the ORIGINAL_ALGORITHM mode against its stored expected stems."""
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # http://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.
        #
        # The comparison used to be run a second time through a file handle
        # that was never closed; a single run over a properly closed file
        # checks exactly the same data.
        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            self._read_lines('stemmers/porter_test/porter_original_output.txt'),
        )

    def test_oed_bug(self):
        """Test for bug https://github.com/nltk/nltk/issues/1581

        Ensures that 'oed' can be stemmed without throwing an error.
        """
        assert PorterStemmer().stem('oed') == 'o'

View File

@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
def test_basic():
    """Tokenize and POS-tag one sentence and compare with the expected tags."""
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    sentence = "John's big idea isn't all that bad."
    tagged = pos_tag(word_tokenize(sentence))

    words = ['John', "'s", 'big', 'idea', 'is', "n't", 'all', 'that', 'bad', '.']
    tags = ['NNP', 'POS', 'JJ', 'NN', 'VBZ', 'RB', 'PDT', 'DT', 'JJ', '.']
    assert tagged == list(zip(words, tags))
def setup_module(module):
    """Skip the whole module when numpy cannot be imported.

    ``module`` is accepted but not used.
    """
    from nose import SkipTest

    try:
        # Only importability matters; the module object is not used.
        __import__('numpy')
    except ImportError:
        raise SkipTest("numpy is required for nltk.test.test_tag")

View File

@@ -0,0 +1,790 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Natural Language Toolkit: TGrep search
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Will Roberts <wildwilhelm@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
'''
Unit tests for nltk.tgrep.
'''
from __future__ import absolute_import, print_function, unicode_literals
import unittest
from six import b
from nltk.tree import ParentedTree
from nltk import tgrep
class TestSequenceFunctions(unittest.TestCase):
'''
Class containing unit tests for nltk.tgrep.
'''
def test_tokenize_simple(self):
    '''
    Simple test of tokenization.
    '''
    pattern = 'A .. (B !< C . D) | ![<< (E , F) $ G]'
    # Expected tokens, written space-separated for readability.
    expected = 'A .. ( B ! < C . D ) | ! [ << ( E , F ) $ G ]'.split()
    self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
def test_tokenize_encoding(self):
    '''
    Test that tokenization handles bytes and strs the same way.
    '''
    pattern = 'A .. (B !< C . D) | ![<< (E , F) $ G]'
    tokens_from_bytes = tgrep.tgrep_tokenize(b(pattern))
    tokens_from_text = tgrep.tgrep_tokenize(pattern)
    self.assertEqual(tokens_from_bytes, tokens_from_text)
def test_tokenize_link_types(self):
    '''
    Test tokenization of basic link types.
    '''
    # Every operator is checked between the node names A and B, first on
    # its own and then preceded by the negation marker '!'.
    operators = [
        '<', '>', '<3', '>3', '<,', '>,', '<-3', '>-3', '<-', '>-',
        '<\'', '>\'', '<:', '>:', '<<', '>>', '<<,', '>>,',
        '<<\'', '>>\'', '<<:', '>>:',
        '.', ',', '..', ',,',
        '$', '$.', '$,', '$..', '$,,',
    ]
    for operator in operators:
        self.assertEqual(
            tgrep.tgrep_tokenize('A' + operator + 'B'), ['A', operator, 'B']
        )
    # Negation is tokenized as a separate '!' token before the operator.
    for operator in operators:
        self.assertEqual(
            tgrep.tgrep_tokenize('A!' + operator + 'B'), ['A', '!', operator, 'B']
        )
def test_tokenize_examples(self):
    '''
    Test tokenization of the TGrep2 manual example patterns.
    '''
    # (pattern, expected tokens written space-separated)
    examples = [
        ('NP < PP', 'NP < PP'),
        ('/^NP/', '/^NP/'),
        ('NP << PP . VP', 'NP << PP . VP'),
        ('NP << PP | . VP', 'NP << PP | . VP'),
        ('NP !<< PP [> NP | >> VP]', 'NP ! << PP [ > NP | >> VP ]'),
        ('NP << (PP . VP)', 'NP << ( PP . VP )'),
        ('NP <\' (PP <, (IN < on))', 'NP <\' ( PP <, ( IN < on ) )'),
        ('S < (A < B) < C', 'S < ( A < B ) < C'),
        ('S < ((A < B) < C)', 'S < ( ( A < B ) < C )'),
        ('S < (A < B < C)', 'S < ( A < B < C )'),
        ('A<B&.C', 'A < B & . C'),
    ]
    for pattern, spaced_tokens in examples:
        self.assertEqual(tgrep.tgrep_tokenize(pattern), spaced_tokens.split())
def test_tokenize_quoting(self):
    '''
    Test tokenization of quoting.
    '''
    pattern = '"A<<:B"<<:"A $.. B"<"A>3B"<C'
    # Quoted node names stay single tokens, operators inside them included.
    expected = ['"A<<:B"', '<<:', '"A $.. B"', '<', '"A>3B"', '<', 'C']
    self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
def test_tokenize_nodenames(self):
    '''
    Test tokenization of node names.
    '''
    cases = [
        ('Robert', ['Robert']),
        ('/^[Bb]ob/', ['/^[Bb]ob/']),
        ('*', ['*']),
        ('__', ['__']),
        # test tokenization of NLTK tree position syntax
        ('N()', ['N(', ')']),
        ('N(0,)', ['N(', '0', ',', ')']),
        ('N(0,0)', ['N(', '0', ',', '0', ')']),
        ('N(0,0,)', ['N(', '0', ',', '0', ',', ')']),
    ]
    for pattern, expected in cases:
        self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
def test_tokenize_macros(self):
    '''
    Test tokenization of macro definitions.
    '''
    pattern = '@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN'
    # Expected tokens, written space-separated for readability.
    expected = '@ NP /^NP/ ; @ NN /^NN/ ; @NP [ ! < NP | < @NN ] ! $.. @NN'.split()
    self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
def test_node_simple(self):
    '''
    Test a simple use of tgrep for finding nodes matching a given
    pattern.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))'
    )

    nn_positions = list(tgrep.tgrep_positions('NN', [tree]))
    self.assertEqual(nn_positions, [[(0, 2), (2, 1)]])

    # tgrep_nodes yields the subtrees found at those positions
    nn_nodes = list(tgrep.tgrep_nodes('NN', [tree]))
    self.assertEqual(nn_nodes, [[tree[0, 2], tree[2, 1]]])

    nn_or_jj_positions = list(tgrep.tgrep_positions('NN|JJ', [tree]))
    self.assertEqual(nn_or_jj_positions, [[(0, 1), (0, 2), (2, 1)]])
def test_node_printing(self):
    '''Test that the tgrep print operator ' is properly ignored.'''
    tree = ParentedTree.fromstring('(S (n x) (N x))')
    for pattern in ('N', '/[Nn]/'):
        plain = list(tgrep.tgrep_positions(pattern, [tree]))
        with_print_operator = list(tgrep.tgrep_positions('\'' + pattern, [tree]))
        self.assertEqual(plain, with_print_operator)
def test_node_encoding(self):
    '''
    Test that tgrep search strings handles bytes and strs the same
    way.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))'
    )
    # (search function, pattern); each pattern is searched once as bytes
    # and once as text, and both results must agree.
    checks = [
        (tgrep.tgrep_positions, 'NN'),
        (tgrep.tgrep_nodes, 'NN'),
        (tgrep.tgrep_positions, 'NN|JJ'),
    ]
    for search, pattern in checks:
        from_bytes = list(search(b(pattern), [tree]))
        from_text = list(search(pattern, [tree]))
        self.assertEqual(from_bytes, from_text)
def test_node_nocase(self):
    '''
    Test selecting nodes using case insensitive node names.
    '''
    tree = ParentedTree.fromstring('(S (n x) (N x))')
    cases = [
        ('"N"', [(1,)]),
        # the i@ prefix makes the match case insensitive
        ('i@"N"', [(0,), (1,)]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), [expected])
def test_node_quoted(self):
    '''
    Test selecting nodes using quoted node names.
    '''
    tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
    cases = [
        ('"N"', [()]),
        ('"\\"N\\""', [(0,)]),
        ('"N\\""', [(1,)]),
        ('"\\"\\\\\\""', [(2,)]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), [expected])
def test_node_regex(self):
    '''
    Test regex matching on nodes.
    '''
    tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
    # This is a regular expression that matches any node whose
    # name starts with NP, including NP-SBJ:
    matches = list(tgrep.tgrep_positions('/^NP/', [tree]))
    self.assertEqual(matches, [[(0,), (1,)]])
def test_node_regex_2(self):
    '''
    Test regex matching on nodes.
    '''
    tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
    cases = [
        # anchored: only names that start with SBJ
        ('/^SBJ/', [(0,), (1,)]),
        # This is a regular expression that matches any node whose
        # name includes SBJ, including NP-SBJ:
        ('/SBJ/', [(0,), (1,), (2,)]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), [expected])
def test_node_tree_position(self):
    '''
    Test matching on nodes based on NLTK tree position.
    '''
    tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
    leaf_positions = {
        tree.leaf_treeposition(index) for index in range(len(tree.leaves()))
    }
    # test all tree positions that are not leaves
    for position in tree.treepositions():
        if position in leaf_positions:
            continue
        # 'N' followed by the position tuple is the tree-position node syntax
        found = list(tgrep.tgrep_positions('N{0}'.format(position), [tree]))
        self.assertEqual(len(found[0]), 1)
        self.assertEqual(found[0][0], position)
def test_node_noleaves(self):
    '''
    Test node name matching with the search_leaves flag set to False.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')

    including_leaves = list(tgrep.tgrep_positions('x', [tree]))
    self.assertEqual(including_leaves, [[(0, 0, 0), (1, 0, 0)]])

    # third positional argument False: leaves are not searched
    excluding_leaves = list(tgrep.tgrep_positions('x', [tree], False))
    self.assertEqual(excluding_leaves, [[]])
def tests_rel_dominance(self):
    '''
    Test matching nodes based on dominance relations.
    '''
    # Each scenario is a tree plus (pattern, expected positions) checks
    # that are run against that tree in the given order.
    scenarios = [
        (
            '(S (A (T x)) (B (N x)))',
            [
                ('* < T', [(0,)]),
                ('* < T > S', [(0,)]),
                ('* !< T', [(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]),
                ('* !< T > S', [(1,)]),
                ('* > A', [(0, 0)]),
                ('* > B', [(1, 0)]),
                ('* !> B', [(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]),
                ('* !> B >> S', [(0,), (0, 0), (1,)]),
                ('* >> S', [(0,), (0, 0), (1,), (1, 0)]),
                ('* >>, S', [(0,), (0, 0)]),
                ('* >>\' S', [(1,), (1, 0)]),
                # Known issue: '* !>> S' (expected [()]) is not checked.
                ('* << T', [(), (0,)]),
                ('* <<\' T', [(0,)]),
                ('* <<1 N', [(1,)]),
                ('* !<< T', [(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]),
            ],
        ),
        (
            '(S (A (T x)) (B (T x) (N x )))',
            [
                ('* <: T', [(0,)]),
                ('* < T', [(0,), (1,)]),
                (
                    '* !<: T',
                    [(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)],
                ),
                ('* !<: T > S', [(1,)]),
            ],
        ),
        (
            '(S (T (A x) (B x)) (T (C x)))',
            [
                ('* >: T', [(1, 0)]),
                (
                    '* !>: T',
                    [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)],
                ),
            ],
        ),
        (
            '(S (A (B (C (D (E (T x))))))' ' (A (B (C (D (E (T x))) (N x)))))',
            [
                (
                    '* <<: T',
                    [
                        (0,),
                        (0, 0),
                        (0, 0, 0),
                        (0, 0, 0, 0),
                        (0, 0, 0, 0, 0),
                        (1, 0, 0, 0),
                        (1, 0, 0, 0, 0),
                    ],
                ),
                (
                    '* >>: A',
                    [
                        (0, 0),
                        (0, 0, 0),
                        (0, 0, 0, 0),
                        (0, 0, 0, 0, 0),
                        (0, 0, 0, 0, 0, 0),
                        (1, 0),
                        (1, 0, 0),
                    ],
                ),
            ],
        ),
    ]
    for tree_string, checks in scenarios:
        tree = ParentedTree.fromstring(tree_string)
        for pattern, expected in checks:
            self.assertEqual(
                list(tgrep.tgrep_positions(pattern, [tree])), [expected]
            )
def test_bad_operator(self):
    '''
    Test error handling of undefined tgrep operators.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
    # The search object is created first; the exception is expected only
    # when list() consumes it.
    pending_search = tgrep.tgrep_positions('* >>> S', [tree])
    self.assertRaises(tgrep.TgrepException, list, pending_search)
def test_comments(self):
    '''
    Test that comments are correctly filtered out of tgrep search
    strings.
    '''
    tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))')
    # Same search twice: without and with '#' comment lines.
    without_comments = (
        '\n'
        '@ NP /^NP/;\n'
        '@ NN /^NN/;\n'
        '@NN\n'
    )
    with_comments = (
        '\n'
        '# macros\n'
        '@ NP /^NP/;\n'
        '@ NN /^NN/;\n'
        '# search string\n'
        '@NN\n'
    )
    for search in (without_comments, with_comments):
        self.assertEqual(
            list(tgrep.tgrep_positions(search, [tree])), [[(0,), (2,)]]
        )
def test_rel_sister_nodes(self):
    '''
    Test matching sister nodes in a tree.
    '''
    tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
    cases = [
        ('* $. B', [(0,)]),
        ('* $.. B', [(0,)]),
        ('* $, B', [(2,)]),
        ('* $,, B', [(2,)]),
        ('* $ B', [(0,), (2,)]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), [expected])
def tests_rel_indexed_children(self):
    '''
    Test matching nodes based on their index in their parent node.
    '''
    # Each scenario is a tree plus (pattern, expected positions) checks.
    scenarios = [
        (
            '(S (A x) (B x) (C x))',
            [
                ('* >, S', [(0,)]),
                ('* >1 S', [(0,)]),
                ('* >2 S', [(1,)]),
                ('* >3 S', [(2,)]),
                ('* >\' S', [(2,)]),
                ('* >-1 S', [(2,)]),
                ('* >-2 S', [(1,)]),
                ('* >-3 S', [(0,)]),
            ],
        ),
        (
            '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) '
            '(F (C x) (A x) (B x)))',
            [
                ('* <, A', [(0,)]),
                ('* <1 A', [(0,)]),
                ('* <2 A', [(2,)]),
                ('* <3 A', [(1,)]),
                ('* <\' A', [(1,)]),
                ('* <-1 A', [(1,)]),
                ('* <-2 A', [(2,)]),
                ('* <-3 A', [(0,)]),
            ],
        ),
    ]
    for tree_string, checks in scenarios:
        tree = ParentedTree.fromstring(tree_string)
        for pattern, expected in checks:
            self.assertEqual(
                list(tgrep.tgrep_positions(pattern, [tree])), [expected]
            )
def test_rel_precedence(self):
    '''
    Test matching nodes based on precedence relations.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (NP (PP x)) (NP (AP x)))'
        ' (VP (AP (X (PP x)) (Y (AP x))))'
        ' (NP (RC (NP (AP x)))))'
    )
    cases = [
        ('* . X', [(0,), (0, 1), (0, 1, 0)]),
        ('* . Y', [(1, 0, 0), (1, 0, 0, 0)]),
        ('* .. X', [(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]),
        (
            '* .. Y',
            [(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)],
        ),
        ('* , X', [(1, 0, 1), (1, 0, 1, 0)]),
        ('* , Y', [(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]),
        (
            '* ,, X',
            [(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)],
        ),
        ('* ,, Y', [(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), [expected])
def test_examples(self):
    '''
    Test the Basic Examples from the TGrep2 manual.
    '''
    # Each scenario is a tree plus the (pattern, expected positions)
    # checks run against it.
    scenarios = [
        (
            '(S (NP (AP x)) (NP (PP x)))',
            # This matches any NP node that immediately dominates a PP:
            [('NP < PP', [(1,)])],
        ),
        (
            '(S (NP x) (VP x) (NP (PP x)) (VP x))',
            # This matches an NP that dominates a PP and is immediately
            # followed by a VP:
            [('NP << PP . VP', [(2,)])],
        ),
        (
            '(S (NP (AP x)) (NP (PP x)) ' '(NP (DET x) (NN x)) (VP x))',
            # This matches an NP that dominates a PP or is immediately
            # followed by a VP:
            [('NP << PP | . VP', [(1,), (2,)])],
        ),
        (
            '(S (NP (NP (PP x)) (NP (AP x)))'
            ' (VP (AP (NP (PP x)) (NP (AP x))))'
            ' (NP (RC (NP (AP x)))))',
            # This matches an NP that does not dominate a PP. Also, the NP
            # must either have a parent that is an NP or be dominated by a
            # VP:
            [('NP !<< PP [> NP | >> VP]', [(0, 1), (1, 0, 1)])],
        ),
        (
            '(S (NP (AP (PP x) (VP x))) ' '(NP (AP (PP x) (NP x))) (NP x))',
            # This matches an NP that dominates a PP which itself is
            # immediately followed by a VP. Note the use of parentheses to
            # group ". VP" with the PP rather than with the NP:
            [('NP << (PP . VP)', [(0,)])],
        ),
        (
            '(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))'
            ' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))'
            ' (NP x))',
            # This matches an NP whose last child is a PP that begins with
            # the preposition "on":
            [('NP <\' (PP <, (IN < on))', [(0,)])],
        ),
        (
            '(S (S (C x) (A (B x))) (S (C x) (A x)) ' '(S (D x) (A (B x))))',
            # The following pattern matches an S which has a child A and
            # another child that is a C and that the A has a child B:
            [('S < (A < B) < C', [(0,)])],
        ),
        (
            '(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))',
            [
                # However, this pattern means that S has child A and that A
                # has children B and C:
                ('S < ((A < B) < C)', [(0,)]),
                # It is equivalent to this:
                ('S < (A < B < C)', [(0,)]),
            ],
        ),
    ]
    for tree_string, checks in scenarios:
        tree = ParentedTree.fromstring(tree_string)
        for pattern, expected in checks:
            self.assertEqual(
                list(tgrep.tgrep_positions(pattern, [tree])), [expected]
            )
def test_use_macros(self):
    '''
    Test defining and using tgrep2 macros.
    '''
    tree = ParentedTree.fromstring(
        '(VP (VB sold) (NP (DET the) '
        '(NN heiress)) (NP (NN deed) (PREP to) '
        '(NP (DET the) (NN school) (NN house))))'
    )
    # Macro definitions shared by both searches below.
    macro_definitions = '@ NP /^NP/;\n@ NN /^NN/;\n'

    matches = list(
        tgrep.tgrep_positions(macro_definitions + '@NP !< @NP !$.. @NN', [tree])
    )
    self.assertEqual(matches, [[(1,), (2, 2)]])

    # use undefined macro @CNP
    pending_search = tgrep.tgrep_positions(
        macro_definitions + '@CNP !< @NP !$.. @NN', [tree]
    )
    self.assertRaises(tgrep.TgrepException, list, pending_search)
def test_tokenize_node_labels(self):
    '''Test tokenization of labeled nodes.'''
    # (pattern, expected tokens written space-separated)
    cases = [
        (
            'S < @SBJ < (@VP < (@VB $.. @OBJ))',
            'S < @SBJ < ( @VP < ( @VB $.. @OBJ ) )',
        ),
        (
            # a label assignment is split into name, '=' and label
            'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))',
            'S < @SBJ = s < ( @VP = v < ( @VB $.. @OBJ ) )',
        ),
    ]
    for pattern, spaced_tokens in cases:
        self.assertEqual(tgrep.tgrep_tokenize(pattern), spaced_tokens.split())
def test_tokenize_segmented_patterns(self):
    '''Test tokenization of segmented patterns.'''
    pattern = 'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'
    # Expected tokens, written space-separated; after the ':' segment
    # separator the label references '=s' and '=v' are single tokens.
    expected = (
        'S < @SBJ = s < ( @VP = v < ( @VB $.. @OBJ ) ) : =s .. =v'.split()
    )
    self.assertEqual(tgrep.tgrep_tokenize(pattern), expected)
def test_labeled_nodes(self):
    '''
    Test labeled nodes.

    Test case from Emily M. Bender.
    '''
    # The macro definitions are kept in their own string so the label-free
    # variant below can reuse them directly.  Deriving that prefix with
    # search.split('\n\n')[0] only works if the search string contains a
    # blank line; without one the split returns the whole search and the
    # label-free pattern ends up glued to the end of the labeled one.
    macros = (
        '\n'
        '# macros\n'
        '@ SBJ /SBJ/;\n'
        '@ VP /VP/;\n'
        '@ VB /VB/;\n'
        '@ VPoB /V[PB]/;\n'
        '@ OBJ /OBJ/;'
    )
    # Full search: labeled nodes plus a second segment relating the labels.
    search = (
        macros + '\n# 1 svo\nS < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'
    )
    sent1 = ParentedTree.fromstring(
        '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))'
    )
    sent2 = ParentedTree.fromstring(
        '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))'
    )
    # Same macros, first segment only and without labels.
    search_firsthalf = macros + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
    # Label-free pattern expected to give the same matches as `search`.
    search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

    # sent1: all three searches are expected to find a match.
    self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
    self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
    self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
    self.assertEqual(
        list(tgrep.tgrep_positions(search, [sent1])),
        list(tgrep.tgrep_positions(search_rewrite, [sent1])),
    )

    # sent2: only the first-half search is expected to match; the full
    # search and its rewrite are expected to find nothing.
    self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
    self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
    self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
    self.assertEqual(
        list(tgrep.tgrep_positions(search, [sent2])),
        list(tgrep.tgrep_positions(search_rewrite, [sent2])),
    )
def test_multiple_conjs(self):
    '''
    Test that multiple (3 or more) conjunctions of node relations are
    handled properly.
    '''
    sent = ParentedTree.fromstring('((A (B b) (C c)) (A (B b) (C c) (D d)))')
    cases = [
        # three relations: only the second A has B, C and D children
        ('(A < B < C < D)', [(1,)]),
        # two relations: both A nodes qualify
        ('(A < B < C)', [(0,), (1,)]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [sent])), [expected])
def test_trailing_semicolon(self):
    '''
    Test that semicolons at the end of a tgrep2 search string won't
    cause a parse failure.
    '''
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))'
    )
    # Zero, one and two trailing semicolons must give the same matches.
    for pattern in ('NN', 'NN;', 'NN;;'):
        self.assertEqual(
            list(tgrep.tgrep_positions(pattern, [tree])), [[(0, 2), (2, 1)]]
        )
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,407 @@
# -*- coding: utf-8 -*-
"""
Unit tests for nltk.tokenize.
See also nltk/test/tokenize.doctest
"""
from __future__ import unicode_literals
import unittest
from nose import SkipTest
from nose.tools import assert_equal
from nltk.tokenize import (
punkt,
word_tokenize,
TweetTokenizer,
StanfordSegmenter,
TreebankWordTokenizer,
SyllableTokenizer,
)
class TestTokenize(unittest.TestCase):
def test_tweet_tokenizer(self):
    """
    TweetTokenizer must keep words with special and accented characters
    intact while stripping the leading handle.
    """
    tweet = "@myke: Let's test these words: resumé España München français"
    tweet_tok = TweetTokenizer(strip_handles=True, reduce_len=True)
    self.assertEqual(
        tweet_tok.tokenize(tweet),
        [
            ':', "Let's", 'test', 'these', 'words', ':',
            'resumé', 'España', 'München', 'français',
        ],
    )
def test_sonority_sequencing_syllable_tokenizer(self):
    """
    SyllableTokenizer must split 'justification' into its five syllables.
    """
    syllables = SyllableTokenizer().tokenize('justification')
    self.assertEqual(syllables, ['jus', 'ti', 'fi', 'ca', 'tion'])
def test_stanford_segmenter_arabic(self):
    """
    Run the Stanford Word Segmenter on an Arabic sentence (default config).

    A LookupError raised while setting up or running the segmenter turns
    into a skipped test.
    """
    expected_tokens = [
        'يبحث', 'علم', 'الحاسوب', 'استخدام', 'الحوسبة', 'ب',
        'جميع', 'اشكال', 'ها', 'ل', 'حل', 'المشكلات',
    ]
    try:
        segmenter = StanfordSegmenter()
        segmenter.default_config('ar')
        sentence = u'يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات'
        output = segmenter.segment(sentence.split())
        assert output.split() == expected_tokens
    except LookupError as err:
        raise SkipTest(str(err))
def test_stanford_segmenter_chinese(self):
    """
    Test the Stanford Word Segmenter for Chinese (default config)

    A LookupError raised while setting up or running the segmenter turns
    into a skipped test.
    """
    try:
        seg = StanfordSegmenter()
        seg.default_config('zh')
        sent = u"这是斯坦福中文分词器测试"
        segmented_sent = seg.segment(sent.split())
    except LookupError as e:
        raise SkipTest(str(e))
    # str.split() without arguments never yields empty strings, so the first
    # two expected tokens must be the leading characters of the sentence
    # that the remaining tokens do not cover.
    self.assertEqual(
        segmented_sent.split(), ['这', '是', '斯坦福', '中文', '分词器', '测试']
    )
def test_phone_tokenizer(self):
    """
    Tokenize strings that look like phone numbers, with and without an
    embedded newline.
    """
    tweet_tok = TweetTokenizer()
    cases = [
        # Recognized as one phone number, albeit one with multiple spaces.
        ("(393)  928 -3010", ['(393)  928 -3010']),
        # The newline keeps the first three elements out of the phone
        # number; the fourth element still is one.
        ("(393)\n928 -3010", ['(', '393', ')', "928 -3010"]),
    ]
    for text, tokens in cases:
        self.assertEqual(tweet_tok.tokenize(text), tokens)
def test_remove_handle(self):
    """
    Exercise handle stripping (remove_handle() from casual.py) on a set of
    specially crafted edge cases.
    """
    tweet_tok = TweetTokenizer(strip_handles=True)

    def check(text, tokens):
        # Tokenize with handle stripping enabled and compare with the
        # expected token list.
        self.assertEqual(tweet_tok.tokenize(text), tokens)

    # Simple example. Handles with just numbers should be allowed
    check("@twitter hello @twi_tter_. hi @12345 @123news", ['hello', '.', 'hi'])

    # Handles are allowed to follow any of the following characters
    check(
        "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n.",
        [
            '`', '~', '(', ')', '-', '=', '+', '\\', '|', '[', ']', '{', '}',
            ';', ':', "'", '"', '/', '?', '.', ',', '<', '>',
            'ñ', '.', 'ü', '.', 'ç', '.',
        ],
    )

    # Handles are NOT allowed to follow any of the following characters
    check(
        "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n",
        [
            'a', '@n', 'j', '@n', 'z', '@n',
            'A', '@n', 'L', '@n', 'Z', '@n',
            '1', '@n', '4', '@n', '7', '@n', '9', '@n', '0', '@n',
            '_', '@n', '!', '@n', '@', '@n', '#', '@n',
            '$', '@n', '%', '@n', '&', '@n', '*', '@n',
        ],
    )

    # Handles are allowed to precede the following characters
    check(
        "@n!a @n#a @n$a @n%a @n&a @n*a",
        ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a'],
    )

    # Tests interactions with special symbols and multiple @
    check(
        "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n",
        [
            '!', '@n', '#', '@n', '$', '@n', '%', '@n', '&', '@n', '*', '@n',
            '@n', '@n', '@', '@n', '@n', '@', '@n',
            '@n_', '@n', '@n7', '@n', '@nj', '@n',
        ],
    )

    # Tests that handles can have a max length of 20
    check(
        "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle",
        ['uvwxyz', '1234', '_', 'endofhandle'],
    )

    # Edge case where an @ comes directly after a long handle
    check(
        "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde",
        [
            'u', '@abcde',
            '@abcdefghijklmnopqrst', '@abcde',
            '_', '@abcde',
            '5', '@abcde',
        ],
    )
def test_treebank_span_tokenizer(self):
    """
    Check the character spans produced by
    TreebankWordTokenizer.span_tokenize.
    """
    treebank_tok = TreebankWordTokenizer()
    cases = [
        # Test case in the docstring
        (
            "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).",
            [
                (0, 4), (5, 12), (13, 17), (18, 19), (19, 23), (24, 26),
                (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), (40, 46),
                (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), (60, 62),
                (63, 68), (69, 70), (70, 76), (76, 77), (77, 78),
            ],
        ),
        # Test case with double quotation
        (
            "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues",
            [
                (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25),
                (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50),
                (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 85),
                (86, 92), (93, 95), (96, 102), (103, 109),
            ],
        ),
        # Test case with double quotation as well as converted quotations
        (
            "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues",
            [
                (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25),
                (26, 27), (27, 36), (37, 42), (42, 43), (44, 46), (47, 50),
                (51, 57), (58, 64), (65, 68), (69, 74), (75, 76), (77, 79),
                (79, 87), (87, 89), (90, 96), (97, 99), (100, 106), (107, 113),
            ],
        ),
    ]
    for text, spans in cases:
        self.assertEqual(list(treebank_tok.span_tokenize(text)), spans)
def test_word_tokenize(self):
    """
    Check how word_tokenize splits apostrophes, quotes and contractions.
    """
    cases = [
        (
            "The 'v', I've been fooled but I'll seek revenge.",
            [
                'The', "'", 'v', "'", ',', 'I', "'ve", 'been', 'fooled',
                'but', 'I', "'ll", 'seek', 'revenge', '.',
            ],
        ),
        ("'v' 're'", ["'", 'v', "'", "'re", "'"]),
    ]
    for text, tokens in cases:
        self.assertEqual(word_tokenize(text), tokens)
def test_punkt_pair_iter(self):
    """
    punkt._pair_iter must pair every item with its successor, using None
    as the partner of the last item.
    """
    cases = (
        ('12', [('1', '2'), ('2', None)]),
        ('123', [('1', '2'), ('2', '3'), ('3', None)]),
        ('1234', [('1', '2'), ('2', '3'), ('3', '4'), ('4', None)]),
    )
    for sequence, pairs in cases:
        assert_equal(list(punkt._pair_iter(sequence)), pairs)
def test_punkt_pair_iter_handles_stop_iteration_exception(self):
    """
    Exhausting punkt._pair_iter over an empty iterator must not raise.
    """
    # An empty iterator makes the very first next() raise StopIteration.
    empty_source = iter([])
    pairs = punkt._pair_iter(empty_source)
    # Draining the generator is the check: any error fails the test.
    list(pairs)
def test_punkt_tokenize_words_handles_stop_iteration_exception(self):
    """
    Exhausting PunktBaseClass._tokenize_words must not raise when the
    language variables produce no words at all.
    """

    class _EmptyWordTokenizer:
        # Stand-in for the language variables: yields no tokens.
        def word_tokenize(self, s):
            return iter([])

    base = punkt.PunktBaseClass()
    base._lang_vars = _EmptyWordTokenizer()
    # Draining the generator is the check: any error fails the test.
    list(base._tokenize_words('test'))

View File

@@ -0,0 +1,181 @@
# -*- coding: utf-8 -*-
"""
Tests for static parts of Twitter package
"""
import os
import unittest
from nose import SkipTest
try:
import twython
except ImportError as e:
raise SkipTest("The twython library has not been installed.")
from nltk.twitter import Authenticate
class TestCredentials(unittest.TestCase):
"""
Tests that Twitter credentials information from file is handled correctly.
"""
def setUp(self):
    """
    Prepare the fixture directory and a fresh Authenticate instance.

    The TWITTER environment variable is set before Authenticate() is
    constructed: test_environment reads `creds_subdir` straight after
    construction, so with the old order the first test to run saw whatever
    value (if any) the variable had beforehand.
    """
    os.environ['TWITTER'] = 'twitter-files'
    self.subdir = os.path.join(os.path.dirname(__file__), 'files')
    self.auth = Authenticate()
def test_environment(self):
    """
    The last path component of `creds_subdir` must equal the value of
    the TWITTER environment variable.
    """
    subdir_name = os.path.basename(self.auth.creds_subdir)
    self.assertEqual(subdir_name, os.environ['TWITTER'])
def test_empty_subdir1(self):
    """
    Loading credentials with an empty subdir path must raise an error.
    """
    # ValueError (zero length field name in format) is raised on
    # python 2.6, OSError on everything else.
    try:
        self.auth.load_creds(subdir='')
    except (OSError, ValueError):
        return
    except Exception as err:
        self.fail('Unexpected exception thrown: %s' % err)
    self.fail('OSError exception not thrown.')
def test_empty_subdir2(self):
    """
    Loading credentials while `creds_subdir` is `None` must raise
    ValueError.
    """
    self.auth.creds_subdir = None
    try:
        self.auth.load_creds()
    except ValueError:
        return
    except Exception as err:
        self.fail('Unexpected exception thrown: %s' % err)
    self.fail('ValueError exception not thrown.')
def test_missingdir(self):
    """
    Loading credentials from a nonexistent directory must raise an error.
    """
    # ValueError (zero length field name in format) is raised on
    # python 2.6, OSError on everything else.
    try:
        self.auth.load_creds(subdir='/nosuchdir')
    except (OSError, ValueError):
        return
    except Exception as err:
        self.fail('Unexpected exception thrown: %s' % err)
    self.fail('OSError exception not thrown.')
def test_missingfile1(self):
    """
    Loading with all defaults must fail, because 'credentials.txt' is not
    present in the default subdir read from `os.environ['TWITTER']`.
    """
    # ValueError (zero length field name in format) is raised on
    # python 2.6, OSError on everything else.
    try:
        self.auth.load_creds()
    except (OSError, ValueError):
        return
    except Exception as err:
        self.fail('Unexpected exception thrown: %s' % err)
    self.fail('OSError exception not thrown.')
def test_missingfile2(self):
    """
    Loading must fail when the credentials file 'foobar' cannot be found
    in the default subdir.
    """
    # ValueError (zero length field name in format) is raised on
    # python 2.6, OSError on everything else.
    try:
        self.auth.load_creds(creds_file='foobar')
    except (OSError, ValueError):
        return
    except Exception as err:
        self.fail('Unexpected exception thrown: %s' % err)
    self.fail('OSError exception not thrown.')
def test_incomplete_file(self):
    """
    The incomplete credentials file 'bad_oauth1-1.txt' must be rejected
    with ValueError.
    """
    try:
        self.auth.load_creds(creds_file='bad_oauth1-1.txt', subdir=self.subdir)
    except ValueError:
        return
    except Exception as err:
        self.fail('Unexpected exception thrown: %s' % err)
    self.fail('ValueError exception not thrown.')
def test_malformed_file1(self):
    """
    The credentials file 'bad_oauth1-2.txt', whose first key is
    ill-formed, must be rejected with ValueError.
    """
    try:
        self.auth.load_creds(creds_file='bad_oauth1-2.txt', subdir=self.subdir)
    except ValueError:
        return
    except Exception as err:
        self.fail('Unexpected exception thrown: %s' % err)
    self.fail('ValueError exception not thrown.')
def test_malformed_file2(self):
    """
    The malformed credentials file 'bad_oauth1-3.txt' must be rejected
    with ValueError.
    """
    # NOTE(review): the previous docstring was copied from
    # test_malformed_file1 and named 'bad_oauth1-2.txt'; the exact defect in
    # 'bad_oauth1-3.txt' is not visible from this test.
    try:
        self.auth.load_creds(creds_file='bad_oauth1-3.txt', subdir=self.subdir)
    except ValueError:
        pass
    except Exception as e:
        self.fail('Unexpected exception thrown: %s' % e)
    else:
        self.fail('ValueError exception not thrown.')
def test_correct_path(self):
    """
    Path to default credentials file is well-formed, given specified
    subdir.
    """
    self.auth.load_creds(subdir=self.subdir)
    # Assert on the computed path; the original assigned to the attribute
    # instead, so the test could never fail.
    self.assertEqual(
        self.auth.creds_fullpath, os.path.join(self.subdir, self.auth.creds_file)
    )
def test_correct_file1(self):
    """
    After loading from the fixture subdir, the credentials file name must
    be the default 'credentials.txt'.
    """
    self.auth.load_creds(subdir=self.subdir)
    chosen_file = self.auth.creds_file
    self.assertEqual(chosen_file, 'credentials.txt')
def test_correct_file2(self):
    """
    The default credentials file must be read correctly: its 'app_key'
    entry is 'a'.
    """
    credentials = self.auth.load_creds(subdir=self.subdir)
    self.assertEqual(credentials['app_key'], 'a')
# Allow this test module to be run directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
"""
Unit tests for nltk.corpus.wordnet
See also nltk/test/wordnet.doctest
"""
from __future__ import unicode_literals
import collections
import os
import unittest
from nose import SkipTest
from nltk.corpus.reader.wordnet import WordNetCorpusReader
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wnic
from nltk.data import find as find_data
# Make sure the WordNet corpus is loaded before any test touches it.
wn.ensure_loaded()
# Short aliases for synset / lemma lookup, used throughout the tests below.
S = wn.synset
L = wn.lemma
class WordnNetDemo(unittest.TestCase):
def test_retrieve_synset(self):
    """Look up 'go.v.21' and check its name, lemmas, gloss and examples."""
    synset = S('go.v.21')
    self.assertEqual(synset.name(), "move.v.15")
    self.assertEqual(synset.lemma_names(), ['move', 'go'])
    gloss = synset.definition()
    self.assertEqual(gloss, "have a turn; make one's move in a game")
    self.assertEqual(synset.examples(), ['Can I go now?'])
def test_retrieve_synsets(self):
    """Check the noun and verb synsets returned for the word 'zap'."""
    noun_synsets = sorted(wn.synsets('zap', pos='n'))
    self.assertEqual(noun_synsets, [S('zap.n.01')])
    verb_synsets = sorted(wn.synsets('zap', pos='v'))
    self.assertEqual(
        verb_synsets,
        [S('microwave.v.01'), S('nuke.v.01'), S('zap.v.01'), S('zap.v.02')],
    )
def test_hyperhyponyms(self):
    """Check hypernym / hyponym lookups, including instance and root variants."""
    # Not every synset has hypernyms().
    self.assertEqual(S('travel.v.01').hypernyms(), [])
    self.assertEqual(S('travel.v.02').hypernyms(), [S('travel.v.03')])
    self.assertEqual(S('travel.v.03').hypernyms(), [])

    # Plain hypernyms and hyponyms.
    self.assertEqual(S('breakfast.n.1').hypernyms(), [S('meal.n.01')])
    meal_hyponyms = [
        S(name)
        for name in (
            'banquet.n.02', 'bite.n.04', 'breakfast.n.01',
            'brunch.n.01', 'buffet.n.02',
        )
    ]
    self.assertEqual(sorted(S('meal.n.1').hyponyms()[:5]), meal_hyponyms)

    # Instance hypernyms and hyponyms.
    self.assertEqual(S('Austen.n.1').instance_hypernyms(), [S('writer.n.01')])
    composer_instances = [
        S(name)
        for name in (
            'ambrose.n.01', 'bach.n.01', 'barber.n.01',
            'bartok.n.01', 'beethoven.n.01',
        )
    ]
    self.assertEqual(
        S('composer.n.1').instance_hyponyms()[:5], composer_instances
    )

    # Root hypernyms.
    self.assertEqual(S('person.n.01').root_hypernyms(), [S('entity.n.01')])
    self.assertEqual(S('sail.v.01').root_hypernyms(), [S('travel.v.01')])
    fall_roots = S('fall.v.12').root_hypernyms()
    self.assertEqual(fall_roots, [S('act.v.01'), S('fall.v.17')])
def test_derivationally_related_forms(self):
    """Check `derivationally_related_forms()` for the lemmas of 'zap.v.03'."""
    nuke_forms = L('zap.v.03.nuke').derivationally_related_forms()
    self.assertEqual(nuke_forms, [L('atomic_warhead.n.01.nuke')])

    atomize_forms = L('zap.v.03.atomize').derivationally_related_forms()
    self.assertEqual(atomize_forms, [L('atomization.n.02.atomization')])

    atomise_forms = L('zap.v.03.atomise').derivationally_related_forms()
    self.assertEqual(atomise_forms, [L('atomization.n.02.atomisation')])

    # The 'zap' lemma itself has no related forms.
    self.assertEqual(L('zap.v.03.zap').derivationally_related_forms(), [])
def test_meronyms_holonyms(self):
    """Check member, part and substance meronym / holonym lookups."""
    dog_holonyms = S('dog.n.01').member_holonyms()
    self.assertEqual(dog_holonyms, [S('canis.n.01'), S('pack.n.06')])
    self.assertEqual(S('dog.n.01').part_meronyms(), [S('flag.n.07')])

    self.assertEqual(S('faculty.n.2').member_meronyms(), [S('professor.n.01')])
    self.assertEqual(S('copilot.n.1').member_holonyms(), [S('crew.n.01')])

    table_parts = S('table.n.2').part_meronyms()
    self.assertEqual(
        table_parts, [S('leg.n.03'), S('tabletop.n.01'), S('tableware.n.01')]
    )
    self.assertEqual(S('course.n.7').part_holonyms(), [S('meal.n.01')])

    water_substances = S('water.n.1').substance_meronyms()
    self.assertEqual(water_substances, [S('hydrogen.n.01'), S('oxygen.n.01')])

    gin_drinks = S('gin.n.1').substance_holonyms()
    self.assertEqual(
        gin_drinks,
        [
            S('gin_and_it.n.01'),
            S('gin_and_tonic.n.01'),
            S('martini.n.01'),
            S('pink_lady.n.01'),
        ],
    )
def test_antonyms(self):
    """Check antonym lookup for a noun lemma and a verb lemma."""
    leader_opposites = L('leader.n.1.leader').antonyms()
    self.assertEqual(leader_opposites, [L('follower.n.01.follower')])

    increase_opposites = L('increase.v.1.increase').antonyms()
    self.assertEqual(increase_opposites, [L('decrease.v.01.decrease')])
def test_misc_relations(self):
    """Check entailments, similar-tos, attributes and pertainyms."""
    self.assertEqual(S('snore.v.1').entailments(), [S('sleep.v.01')])

    similar_to_heavy = [
        S(name)
        for name in (
            'dense.s.03',
            'doughy.s.01',
            'heavier-than-air.s.01',
            'hefty.s.02',
            'massive.s.04',
            'non-buoyant.s.01',
            'ponderous.s.02',
        )
    ]
    self.assertEqual(S('heavy.a.1').similar_tos(), similar_to_heavy)

    # 'light' and 'heavy' share the same attribute.
    self.assertEqual(S('light.a.1').attributes(), [S('weight.n.01')])
    self.assertEqual(S('heavy.a.1').attributes(), [S('weight.n.01')])

    # Pertainyms.
    english_pertainyms = L('English.a.1.English').pertainyms()
    self.assertEqual(english_pertainyms, [L('england.n.01.England')])
def test_lch(self):
    """Check lowest_common_hypernyms() for two pairs of synsets."""
    person_dog = S('person.n.01').lowest_common_hypernyms(S('dog.n.01'))
    self.assertEqual(person_dog, [S('organism.n.01')])

    woman_girlfriend = S('woman.n.01').lowest_common_hypernyms(
        S('girlfriend.n.02')
    )
    self.assertEqual(woman_girlfriend, [S('woman.n.01')])
def test_domains(self):
    """Check topic, region and usage domain lookups."""
    topic = S('code.n.03').topic_domains()
    self.assertEqual(topic, [S('computer_science.n.01')])
    region = S('pukka.a.01').region_domains()
    self.assertEqual(region, [S('india.n.01')])
    usage = S('freaky.a.01').usage_domains()
    self.assertEqual(usage, [S('slang.n.02')])
def test_in_topic_domains(self):
    """Spot-check single entries of the in_*_domains() result lists."""
    in_topic = S('computer_science.n.01').in_topic_domains()
    self.assertEqual(in_topic[0], S('access.n.05'))
    in_region = S('germany.n.01').in_region_domains()
    self.assertEqual(in_region[23], S('trillion.n.02'))
    in_usage = S('slang.n.02').in_usage_domains()
    self.assertEqual(in_usage[1], S('airhead.n.01'))
def test_wordnet_similarities(self):
    """Check path-based and information-content similarity scores for dog/cat."""
    # Path based similarities.
    cat_to_cat = S('cat.n.01').path_similarity(S('cat.n.01'))
    self.assertAlmostEqual(cat_to_cat, 1.0)
    dog_to_cat = S('dog.n.01').path_similarity(S('cat.n.01'))
    self.assertAlmostEqual(dog_to_cat, 0.2)
    lch_score = S('dog.n.01').lch_similarity(S('cat.n.01'))
    self.assertAlmostEqual(lch_score, 2.028, places=3)
    wup_score = S('dog.n.01').wup_similarity(S('cat.n.01'))
    self.assertAlmostEqual(wup_score, 0.8571, places=3)

    # Information Content similarities.
    brown_ic = wnic.ic('ic-brown.dat')
    jcn_score = S('dog.n.01').jcn_similarity(S('cat.n.01'), brown_ic)
    self.assertAlmostEqual(jcn_score, 0.4497, places=3)
    semcor_ic = wnic.ic('ic-semcor.dat')
    lin_score = S('dog.n.01').lin_similarity(S('cat.n.01'), semcor_ic)
    self.assertAlmostEqual(lin_score, 0.8863, places=3)
def test_omw_lemma_no_trailing_underscore(self):
    """Slovenian lemma names of 'about-face.n.02' must carry no trailing underscore."""
    slovenian_names = S('about-face.n.02').lemma_names(lang='slv')
    self.assertEqual(
        slovenian_names,
        [
            u'popolna_sprememba_v_mišljenju',
            u'popoln_obrat',
            u'preobrat',
            u'preobrat_v_mišljenju',
        ],
    )
def test_iterable_type_for_all_lemma_names(self):
    """
    Duck-test that wn.all_lemma_names() returns an iterator for both the
    'cat' and the 'eng' language codes.
    """
    # See https://stackoverflow.com/a/36230057/610569
    cat_lemmas = wn.all_lemma_names(lang='cat')
    eng_lemmas = wn.all_lemma_names(lang='eng')

    self.assertTrue(hasattr(eng_lemmas, '__iter__'))
    self.assertTrue(hasattr(eng_lemmas, '__next__') or hasattr(eng_lemmas, 'next'))
    self.assertTrue(eng_lemmas.__iter__() is eng_lemmas)

    self.assertTrue(hasattr(cat_lemmas, '__iter__'))
    # Check cat_lemmas itself for the Python 2 `next` method; the original
    # looked at eng_lemmas here by mistake.
    self.assertTrue(hasattr(cat_lemmas, '__next__') or hasattr(cat_lemmas, 'next'))
    self.assertTrue(cat_lemmas.__iter__() is cat_lemmas)

View File

@@ -0,0 +1,271 @@
# -*- coding: utf-8 -*-
"""
Tests for BLEU translation evaluation metric
"""
import functools
import io
import unittest
from nltk.data import find
from nltk.translate.bleu_score import (
modified_precision,
brevity_penalty,
closest_ref_length,
)
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
class TestBLEU(unittest.TestCase):
def test_modified_precision(self):
    """
    Modified n-gram precision on the examples from the original BLEU paper
    http://www.aclweb.org/anthology/P02-1040.pdf
    """
    # Example 1: the "the*" example.
    the_refs = [
        'the cat is on the mat'.split(),
        'there is a cat on the mat'.split(),
    ]
    the_hyp = 'the the the the the the the'.split()
    # Modified unigram precision, checked by rounding and by
    # assertAlmostEqual at 4 place precision.
    the_unigram = float(modified_precision(the_refs, the_hyp, n=1))
    assert round(the_unigram, 4) == 0.2857
    self.assertAlmostEqual(the_unigram, 0.28571428, places=4)
    # Modified bigram precision.
    assert float(modified_precision(the_refs, the_hyp, n=2)) == 0.0

    # Example 2: the "of the" example.
    party_refs = [
        (
            'It is a guide to action that ensures that the military '
            'will forever heed Party commands'
        ).split(),
        (
            'It is the guiding principle which guarantees the military '
            'forces always being under the command of the Party'
        ).split(),
        (
            'It is the practical guide for the army always to heed '
            'the directions of the party'
        ).split(),
    ]
    of_the_hyp = 'of the'.split()
    # Modified unigram, then bigram precision.
    assert float(modified_precision(party_refs, of_the_hyp, n=1)) == 1.0
    assert float(modified_precision(party_refs, of_the_hyp, n=2)) == 1.0

    # Example 3: Proper MT outputs, scored against the same references.
    good_hyp = (
        'It is a guide to action which ensures that the military '
        'always obeys the commands of the party'
    ).split()
    poor_hyp = (
        'It is to insure the troops forever hearing the activity '
        'guidebook that party direct'
    ).split()

    # Unigram precision.
    good_unigram = float(modified_precision(party_refs, good_hyp, n=1))
    poor_unigram = float(modified_precision(party_refs, poor_hyp, n=1))
    self.assertAlmostEqual(good_unigram, 0.94444444, places=4)
    self.assertAlmostEqual(poor_unigram, 0.57142857, places=4)
    assert round(good_unigram, 4) == 0.9444
    assert round(poor_unigram, 4) == 0.5714

    # Bigram precision.
    good_bigram = float(modified_precision(party_refs, good_hyp, n=2))
    poor_bigram = float(modified_precision(party_refs, poor_hyp, n=2))
    self.assertAlmostEqual(good_bigram, 0.58823529, places=4)
    self.assertAlmostEqual(poor_bigram, 0.07692307, places=4)
    assert round(good_bigram, 4) == 0.5882
    assert round(poor_bigram, 4) == 0.0769
def test_brevity_penalty(self):
    """
    Brevity penalty on the cases from the brevity_penalty_closest function
    in mteval-v13a.pl (the same cases as the doctest in
    nltk.translate.bleu_score.py).
    """
    hyp_len = len(['a'] * 7)

    # Closest reference is longer than the hypothesis: penalty below 1.
    longer_refs = [['a'] * 11, ['a'] * 8]
    closest = closest_ref_length(longer_refs, hyp_len)
    self.assertAlmostEqual(brevity_penalty(closest, hyp_len), 0.8669, places=4)

    # A reference of exactly the hypothesis length: no penalty.
    mixed_refs = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
    closest = closest_ref_length(mixed_refs, hyp_len)
    assert brevity_penalty(closest, hyp_len) == 1.0
def test_zero_matches(self):
    """BLEU must be 0 when the hypothesis shares nothing with the reference."""
    refs = ['The candidate has no alignment to any of the references'.split()]
    hyp = 'John loves Mary'.split()
    # Try every n-gram order from 1 up to, but not including, len(hyp),
    # each time with uniform weights.
    for order in range(1, len(hyp)):
        uniform = [1.0 / order] * order
        assert sentence_bleu(refs, hyp, uniform) == 0
def test_full_matches(self):
    """BLEU must be 1.0 when the hypothesis equals the reference."""
    refs = ['John loves Mary'.split()]
    hyp = 'John loves Mary'.split()
    # Try every n-gram order from 1 up to, but not including, len(hyp),
    # each time with uniform weights.
    for order in range(1, len(hyp)):
        uniform = [1.0 / order] * order
        assert sentence_bleu(refs, hyp, uniform) == 1.0
def test_partial_matches_hypothesis_longer_than_reference(self):
    """BLEU is 0 (with a warning) when the hypothesis outgrows a 3-word reference."""
    refs = ['John loves Mary'.split()]
    hyp = 'John loves Mary who loves Mike'.split()
    # No 4-gram can match, so the score collapses to zero:
    # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
    score = sentence_bleu(refs, hyp)
    self.assertAlmostEqual(score, 0.0, places=4)
    # A UserWarning is expected because len(reference) < 4.
    try:
        self.assertWarns(UserWarning, sentence_bleu, refs, hyp)
    except AttributeError:
        # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
        pass
# @unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
def test_case_where_n_is_bigger_than_hypothesis_length(self):
    """BLEU of order n must be 0 when n exceeds the hypothesis length."""
    hyp = 'John loves Mary'.split()
    order = len(hyp) + 1
    uniform = [1.0 / order] * order

    # Case 1: the reference is long enough, the hypothesis is not.
    refs = ['John loves Mary ?'.split()]
    # No n-gram of that order can match, so the score is zero:
    # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
    self.assertAlmostEqual(sentence_bleu(refs, hyp, uniform), 0.0, places=4)
    # A UserWarning is expected because len(hypothesis) < 4.
    try:
        self.assertWarns(UserWarning, sentence_bleu, refs, hyp)
    except AttributeError:
        # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
        pass

    # Case 2: n exceeds both lengths, in the special case where the
    # reference equals the hypothesis. Still no 4-gram matches.
    refs = ['John loves Mary'.split()]
    hyp = 'John loves Mary'.split()
    self.assertAlmostEqual(sentence_bleu(refs, hyp, uniform), 0.0, places=4)
def test_empty_hypothesis(self):
    """BLEU must be 0 for an empty hypothesis."""
    refs = ['The candidate has no alignment to any of the references'.split()]
    assert sentence_bleu(refs, []) == 0
def test_empty_references(self):
    """BLEU must be 0 when the only reference is empty."""
    hyp = 'John loves Mary'.split()
    assert sentence_bleu([[]], hyp) == 0
def test_empty_references_and_hypothesis(self):
    """BLEU must be 0 when both the reference and the hypothesis are empty."""
    empty_refs, empty_hyp = [[]], []
    assert sentence_bleu(empty_refs, empty_hyp) == 0
def test_reference_or_hypothesis_shorter_than_fourgrams(self):
    """BLEU is 0 (with a warning) when reference and hypothesis are shorter than 4."""
    refs = ['let it go'.split()]
    hyp = 'let go it'.split()
    # The default-order score for this pair is 0.0:
    # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
    score = sentence_bleu(refs, hyp)
    self.assertAlmostEqual(score, 0.0, places=4)
    # A UserWarning is expected as well.
    try:
        self.assertWarns(UserWarning, sentence_bleu, refs, hyp)
    except AttributeError:
        # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
        pass
class TestBLEUvsMteval13a(unittest.TestCase):
    """Compare corpus_bleu against the scores recorded in an mteval-13a output file."""

    def test_corpus_bleu(self):
        """
        corpus_bleu must stay within 0.005 of each recorded mteval score,
        both without smoothing and with SmoothingFunction.method3.
        """
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are on the second-to-last line of the file; the
            # slice drops its first and last whitespace-separated fields.
            # Materialise the scores as a list: on Python 3 map() returns a
            # one-shot iterator, which the first loop below would exhaust,
            # silently turning the smoothing loop into a no-op.
            mteval_bleu_scores = list(
                map(float, mteval_fin.readlines()[-2].split()[1:-1])
            )

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace tokenize the file.
                # Note: split() automatically strip().
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_bleu input is list of list of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references, hypothesis, weights=(1.0 / i,) * i
                    )
                    # Check that the BLEU scores difference is less than 0.005 .
                    # Note: This is an approximate comparison; as much as
                    #       +/- 0.01 BLEU might be "statistically significant",
                    #       the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references,
                        hypothesis,
                        weights=(1.0 / i,) * i,
                        smoothing_function=chencherry.method3,
                    )
                    assert abs(mteval_bleu - nltk_bleu) < 0.005
class TestBLEUWithBadSentence(unittest.TestCase):
    """corpus_bleu on a garbage hypothesis."""

    def test_corpus_bleu_with_bad_sentence(self):
        """The score must be (almost) 0.0 and a UserWarning must be raised."""
        garbage = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
        reference_text = (
            "Their tasks include changing a pump on the faulty stokehold ."
            "Likewise , two species that are very similar in morphology "
            "were distinguished using genetics ."
        )
        refs = [[reference_text.split()]]
        hyps = [garbage.split()]
        try:
            # Both checks at once: the warning must be raised and the
            # (undesired) BLEU output must be zero.
            with self.assertWarns(UserWarning):
                self.assertAlmostEqual(corpus_bleu(refs, hyps), 0.0, places=4)
        except AttributeError:
            # unittest.TestCase.assertWarns is only supported in
            # Python >= 3.2; check the score alone.
            self.assertAlmostEqual(corpus_bleu(refs, hyps), 0.0, places=4)

View File

@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
"""
Tests GDFA alignments
"""
import functools
import io
import unittest
from nltk.translate.gdfa import grow_diag_final_and
class TestGDFA(unittest.TestCase):
    """Regression test for the grow-diag-final-and alignment symmetrisation."""

    def test_from_eflomal_outputs(self):
        """
        Run GDFA on the first 10 eflomal outputs from issue #1829
        https://github.com/nltk/nltk/issues/1829
        """
        # Forward alignments, one string per sentence pair.
        fwd_alignments = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14',
            '0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10',
            '0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31',
            '0-0 1-1 0-2 2-3',
            '0-0 2-2 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20',
            '3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14',
            '1-0',
        ]
        # Backward alignments for the same sentence pairs.
        bwd_alignments = [
            '0-0 1-2',
            '0-0 1-1',
            '0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13',
            '0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8',
            '0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31',
            '0-0 1-1 2-3',
            '0-0 1-1 2-3 4-4',
            '0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18',
            '0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10',
            '1-0',
        ]
        src_lengths = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
        trg_lengths = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]

        # Expected symmetrised alignments.
        # NOTE(review): this list has 11 entries while the inputs have 10,
        # so zip() below never checks the last entry.
        expected_alignments = [
            [(0, 0), (1, 2)],
            [(0, 0), (1, 1)],
            [
                (0, 0), (2, 1), (3, 2), (4, 3), (5, 4),
                (6, 5), (7, 6), (8, 7), (10, 10), (11, 12),
            ],
            [
                (0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6),
                (5, 7), (6, 8), (7, 5), (8, 7), (8, 9), (9, 8), (9, 10),
            ],
            [
                (0, 0), (1, 8), (2, 9), (3, 10), (4, 11), (5, 8),
                (6, 9), (6, 11), (7, 10), (8, 11), (31, 31),
            ],
            [(0, 0), (0, 2), (1, 1), (2, 3)],
            [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
            [
                (0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (7, 6),
                (8, 7), (9, 8), (10, 9), (11, 10), (12, 11), (13, 12),
                (14, 13), (15, 14), (16, 16), (17, 17), (18, 18), (19, 19),
            ],
            [
                (0, 0), (1, 1), (3, 0), (3, 2), (4, 1), (5, 3), (6, 2),
                (6, 4), (7, 5), (8, 6), (9, 7), (9, 12), (10, 8), (10, 13),
                (11, 9), (12, 8), (12, 14), (13, 9), (14, 8), (15, 9), (16, 10),
            ],
            [(1, 0)],
            [
                (0, 0), (1, 1), (3, 2), (4, 3), (5, 4), (6, 5),
                (7, 6), (9, 10), (10, 12), (11, 13), (12, 14), (13, 15),
            ],
        ]

        # Walk the examples in lockstep and compare each symmetrised result.
        examples = zip(
            fwd_alignments, bwd_alignments, src_lengths, trg_lengths, expected_alignments
        )
        for fwd, bwd, n_src, n_trg, wanted in examples:
            self.assertListEqual(wanted, grow_diag_final_and(n_src, n_trg, fwd, bwd))

View File

@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 1 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel1
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel1(unittest.TestCase):
def test_set_uniform_translation_probabilities(self):
    """Uniform initialisation must give every table entry seen in training the same probability."""
    # arrange
    training_pairs = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model = IBMModel1(training_pairs, 0)

    # act
    model.set_uniform_probabilities(training_pairs)

    # assert
    # expected_prob = 1.0 / (target vocab size + 1)
    uniform_prob = 1.0 / 3
    table = model.translation_table
    self.assertEqual(table['ham']['eier'], uniform_prob)
    self.assertEqual(table['eggs'][None], uniform_prob)
def test_set_uniform_translation_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model1 = IBMModel1(corpus, 0)
# act
model1.set_uniform_probabilities(corpus)
# assert
# examine target words that are not in the training data domain
self.assertEqual(model1.translation_table['parrot']['eier'], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
['UNUSED'] + trg_sentence,
None,
)
translation_table = defaultdict(lambda: defaultdict(float))
translation_table['i']['ich'] = 0.98
translation_table['love']['gern'] = 0.98
translation_table['to'][None] = 0.98
translation_table['eat']['esse'] = 0.98
translation_table['smoked']['räucherschinken'] = 0.98
translation_table['ham']['räucherschinken'] = 0.98
model1 = IBMModel1(corpus, 0)
model1.translation_table = translation_table
# act
probability = model1.prob_t_a_given_s(alignment_info)
# assert
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
expected_probability = lexical_translation
self.assertEqual(round(probability, 4), round(expected_probability, 4))

View File

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 2 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel2
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel2(unittest.TestCase):
    """Tests for the alignment-table handling of ``IBMModel2``."""

    @staticmethod
    def _uniform_model():
        """Return an ``IBMModel2`` built on a two-pair toy corpus.

        ``set_uniform_probabilities`` is called on the model before it is
        returned.
        """
        sentence_pairs = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        model = IBMModel2(sentence_pairs, 0)
        model.set_uniform_probabilities(sentence_pairs)
        return model

    def test_set_uniform_alignment_probabilities(self):
        model = self._uniform_model()

        # The asserted values are 1 / (source sentence length + 1):
        # the sources have 3 and 2 words respectively.
        self.assertEqual(model.alignment_table[0][1][3][2], 1.0 / 4)
        self.assertEqual(model.alignment_table[2][4][2][4], 1.0 / 3)

    def test_set_uniform_alignment_probabilities_of_non_domain_values(self):
        model = self._uniform_model()

        # Positions (99) that lie outside the training sentences must map
        # to the minimum probability.
        self.assertEqual(model.alignment_table[99][1][3][2], IBMModel.MIN_PROB)
        self.assertEqual(model.alignment_table[2][99][2][4], IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        # arrange: one sentence pair plus a hand-written alignment
        source_words = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        target_words = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + source_words,
            ['UNUSED'] + target_words,
            None,
        )

        lexical_table = defaultdict(lambda: defaultdict(float))
        for trg_word, src_word in (
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ):
            lexical_table[trg_word][src_word] = 0.98

        # Alignment probabilities keyed by (source position, target
        # position), all for a 5-word source and a 6-word target.
        position_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        )
        for src_pos, trg_pos, prob in (
            (0, 3, 0.97),  # None -> to
            (1, 1, 0.97),  # ich -> i
            (2, 4, 0.97),  # esse -> eat
            (4, 2, 0.97),  # gern -> love
            (5, 5, 0.96),  # räucherschinken -> smoked
            (5, 6, 0.96),  # räucherschinken -> ham
        ):
            position_table[src_pos][trg_pos][5][6] = prob

        model = IBMModel2([AlignedSent(target_words, source_words)], 0)
        model.translation_table = lexical_table
        model.alignment_table = position_table

        # act
        actual = model.prob_t_a_given_s(info)

        # assert: product of lexical and alignment factors, to 4 decimals
        lexical_part = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        alignment_part = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
        expected = lexical_part * alignment_part
        self.assertEqual(round(actual, 4), round(expected, 4))

View File

@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 3 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel3
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel3(unittest.TestCase):
    """Tests for the distortion-table handling of ``IBMModel3``."""

    @staticmethod
    def _uniform_model():
        """Return an ``IBMModel3`` built on a two-pair toy corpus.

        ``set_uniform_probabilities`` is called on the model before it is
        returned.
        """
        sentence_pairs = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        model = IBMModel3(sentence_pairs, 0)
        model.set_uniform_probabilities(sentence_pairs)
        return model

    def test_set_uniform_distortion_probabilities(self):
        model = self._uniform_model()

        # The asserted values are 1 / (target sentence length): the
        # targets have 2 and 4 words respectively.
        self.assertEqual(model.distortion_table[1][0][3][2], 1.0 / 2)
        self.assertEqual(model.distortion_table[4][2][2][4], 1.0 / 4)

    def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
        model = self._uniform_model()

        # Index combinations that do not occur in the training data must
        # map to the minimum probability.
        self.assertEqual(model.distortion_table[0][0][3][2], IBMModel.MIN_PROB)
        self.assertEqual(model.distortion_table[9][2][2][4], IBMModel.MIN_PROB)
        self.assertEqual(model.distortion_table[2][9][2][4], IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        # arrange: one sentence pair plus a hand-written alignment with cepts
        source_words = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        target_words = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + source_words,
            ['UNUSED'] + target_words,
            [[3], [1], [4], [], [2], [5, 6]],
        )

        # Distortion probabilities keyed by (target position, source
        # position), all for a 5-word source and a 6-word target.
        distortions = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
        )
        for trg_pos, src_pos in (
            (1, 1),  # i -> ich
            (2, 4),  # love -> gern
            (3, 0),  # to -> NULL
            (4, 2),  # eat -> esse
            (5, 5),  # smoked -> räucherschinken
            (6, 5),  # ham -> räucherschinken
        ):
            distortions[trg_pos][src_pos][5][6] = 0.97

        lexical_table = defaultdict(lambda: defaultdict(float))
        for trg_word, src_word in (
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ):
            lexical_table[trg_word][src_word] = 0.98

        # Fertility probabilities keyed by (fertility, source word).
        fertilities = defaultdict(lambda: defaultdict(float))
        for fertility, src_word, prob in (
            (1, 'ich', 0.99),
            (1, 'esse', 0.99),
            (0, 'ja', 0.99),
            (1, 'gern', 0.99),
            (2, 'räucherschinken', 0.999),
            (1, None, 0.99),
        ):
            fertilities[fertility][src_word] = prob

        probabilities = {
            'p1': 0.167,
            'translation_table': lexical_table,
            'distortion_table': distortions,
            'fertility_table': fertilities,
            'alignment_table': None,
        }
        model = IBMModel3(
            [AlignedSent(target_words, source_words)], 0, probabilities
        )

        # act
        actual = model.prob_t_a_given_s(info)

        # assert: product of the four model components, to 4 decimals
        null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
        fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
        expected = null_generation * fertility * lexical_translation * distortion
        self.assertEqual(round(actual, 4), round(expected, 4))

View File

@@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 4 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel4
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel4(unittest.TestCase):
    """Tests for the head / non-head distortion tables of ``IBMModel4``."""

    @staticmethod
    def _uniform_model():
        """Return an ``IBMModel4`` built on a two-pair toy corpus.

        The model is created with small source/target word-class maps and
        ``set_uniform_probabilities`` is called before it is returned.
        """
        source_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        target_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        sentence_pairs = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        model = IBMModel4(sentence_pairs, 0, source_classes, target_classes)
        model.set_uniform_probabilities(sentence_pairs)
        return model

    def test_set_uniform_distortion_probabilities_of_max_displacements(self):
        model = self._uniform_model()

        # Number of displacement values =
        #     2 * (number of words in longest target sentence - 1)
        uniform_prob = 1.0 / (2 * (4 - 1))

        # Boundary values for (displacement, src_class, trg_class).
        self.assertEqual(model.head_distortion_table[3][0][0], uniform_prob)
        self.assertEqual(model.head_distortion_table[-3][1][2], uniform_prob)
        self.assertEqual(model.non_head_distortion_table[3][0], uniform_prob)
        self.assertEqual(model.non_head_distortion_table[-3][2], uniform_prob)

    def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
        model = self._uniform_model()
        floor = IBMModel.MIN_PROB

        # Displacements beyond what the training data allows must map to
        # the minimum probability.
        self.assertEqual(model.head_distortion_table[4][0][0], floor)
        self.assertEqual(model.head_distortion_table[100][1][2], floor)
        self.assertEqual(model.non_head_distortion_table[4][0], floor)
        self.assertEqual(model.non_head_distortion_table[100][2], floor)

    def test_prob_t_a_given_s(self):
        # arrange: one sentence pair, word classes and a hand-written
        # alignment with cepts
        source_words = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        target_words = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        source_classes = {
            'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4
        }
        target_classes = {
            'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4
        }
        info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + source_words,
            ['UNUSED'] + target_words,
            [[3], [1], [4], [], [2], [5, 6]],
        )

        # Head distortions keyed by (displacement, src_class, trg_class).
        head_distortions = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        for displacement, src_class, trg_class in (
            (1, None, 3),  # None, i
            (3, 2, 4),  # ich, eat
            (-2, 3, 4),  # esse, love
            (3, 4, 1),  # gern, smoked
        ):
            head_distortions[displacement][src_class][trg_class] = 0.97

        # Non-head distortions keyed by (displacement, trg_class).
        non_head_distortions = defaultdict(lambda: defaultdict(float))
        non_head_distortions[1][0] = 0.96  # ham

        lexical_table = defaultdict(lambda: defaultdict(float))
        for trg_word, src_word in (
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ):
            lexical_table[trg_word][src_word] = 0.98

        # Fertility probabilities keyed by (fertility, source word).
        fertilities = defaultdict(lambda: defaultdict(float))
        for fertility, src_word, prob in (
            (1, 'ich', 0.99),
            (1, 'esse', 0.99),
            (0, 'ja', 0.99),
            (1, 'gern', 0.99),
            (2, 'räucherschinken', 0.999),
            (1, None, 0.99),
        ):
            fertilities[fertility][src_word] = prob

        probabilities = {
            'p1': 0.167,
            'translation_table': lexical_table,
            'head_distortion_table': head_distortions,
            'non_head_distortion_table': non_head_distortions,
            'fertility_table': fertilities,
            'alignment_table': None,
        }
        model = IBMModel4(
            [AlignedSent(target_words, source_words)],
            0,
            source_classes,
            target_classes,
            probabilities,
        )

        # act
        actual = model.prob_t_a_given_s(info)

        # assert: product of the four model components, to 4 decimals
        null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
        fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
        expected = null_generation * fertility * lexical_translation * distortion
        self.assertEqual(round(actual, 4), round(expected, 4))

View File

@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 5 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel4
from nltk.translate import IBMModel5
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel5(unittest.TestCase):
    """Tests for the vacancy tables and alignment pruning of ``IBMModel5``."""

    def test_set_uniform_vacancy_probabilities_of_max_displacements(self):
        # arrange
        src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        corpus = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        model5 = IBMModel5(corpus, 0, src_classes, trg_classes)

        # act
        model5.set_uniform_probabilities(corpus)

        # assert
        # number of vacancy difference values =
        #     2 * number of words in longest target sentence
        expected_prob = 1.0 / (2 * 4)

        # examine the boundary values for (dv, max_v, trg_class)
        self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob)
        self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob)
        self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
        self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)

    def test_set_uniform_vacancy_probabilities_of_non_domain_values(self):
        # arrange
        src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        corpus = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
        ]
        model5 = IBMModel5(corpus, 0, src_classes, trg_classes)

        # act
        model5.set_uniform_probabilities(corpus)

        # assert
        # examine dv and max_v values that are not in the training data domain
        self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
        self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
        self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB)
        self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
        self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)

    def test_prob_t_a_given_s(self):
        # arrange: one sentence pair, word classes and a hand-written
        # alignment with cepts
        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4}
        trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4}
        corpus = [AlignedSent(trg_sentence, src_sentence)]
        alignment_info = AlignmentInfo(
            (0, 1, 4, 0, 2, 5, 5),
            [None] + src_sentence,
            ['UNUSED'] + trg_sentence,
            [[3], [1], [4], [], [2], [5, 6]],
        )

        # The first key of each vacancy table is written as a difference
        # of two values, matching "dv" in the (dv, max_v, trg_class) keys.
        head_vacancy_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        head_vacancy_table[1 - 0][6][3] = 0.97  # ich -> i
        head_vacancy_table[3 - 0][5][4] = 0.97  # esse -> eat
        head_vacancy_table[1 - 2][4][4] = 0.97  # gern -> love
        head_vacancy_table[2 - 0][2][1] = 0.97  # räucherschinken -> smoked

        non_head_vacancy_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))
        )
        non_head_vacancy_table[1 - 0][1][0] = 0.96  # räucherschinken -> ham

        translation_table = defaultdict(lambda: defaultdict(float))
        translation_table['i']['ich'] = 0.98
        translation_table['love']['gern'] = 0.98
        translation_table['to'][None] = 0.98
        translation_table['eat']['esse'] = 0.98
        translation_table['smoked']['räucherschinken'] = 0.98
        translation_table['ham']['räucherschinken'] = 0.98

        # fertility_table[fertility][source word]
        fertility_table = defaultdict(lambda: defaultdict(float))
        fertility_table[1]['ich'] = 0.99
        fertility_table[1]['esse'] = 0.99
        fertility_table[0]['ja'] = 0.99
        fertility_table[1]['gern'] = 0.99
        fertility_table[2]['räucherschinken'] = 0.999
        fertility_table[1][None] = 0.99

        probabilities = {
            'p1': 0.167,
            'translation_table': translation_table,
            'fertility_table': fertility_table,
            'head_vacancy_table': head_vacancy_table,
            'non_head_vacancy_table': non_head_vacancy_table,
            'head_distortion_table': None,
            'non_head_distortion_table': None,
            'alignment_table': None,
        }

        model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities)

        # act
        probability = model5.prob_t_a_given_s(alignment_info)

        # assert: product of the four model components, to 4 decimals
        null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
        fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
        expected_probability = (
            null_generation * fertility * lexical_translation * vacancy
        )
        self.assertEqual(round(probability, 4), round(expected_probability, 4))

    def test_prune(self):
        # arrange
        alignment_infos = [
            AlignmentInfo((1, 1), None, None, None),
            AlignmentInfo((1, 2), None, None, None),
            AlignmentInfo((2, 1), None, None, None),
            AlignmentInfo((2, 2), None, None, None),
            AlignmentInfo((0, 0), None, None, None),
        ]
        min_factor = IBMModel5.MIN_SCORE_FACTOR
        best_score = 0.9
        scores = {
            (1, 1): min(min_factor * 1.5, 1) * best_score,  # above threshold
            (1, 2): best_score,
            (2, 1): min_factor * best_score,  # at threshold
            (2, 2): min_factor * best_score * 0.5,  # low score
            (0, 0): min(min_factor * 1.1, 1) * 1.2,  # above threshold
        }
        corpus = [AlignedSent(['a'], ['b'])]

        # Replace the Model 4 scoring function with a lookup into `scores`.
        # The patch lives on the class, so it must be undone even when the
        # test fails; otherwise the mock leaks into every test that runs
        # afterwards in the same process.
        original_prob_function = IBMModel4.model4_prob_t_a_given_s
        IBMModel4.model4_prob_t_a_given_s = staticmethod(
            lambda a, model: scores[a.alignment]
        )
        try:
            model5 = IBMModel5(corpus, 0, None, None)

            # act
            pruned_alignments = model5.prune(alignment_infos)

            # assert
            self.assertEqual(len(pruned_alignments), 3)
        finally:
            # Looking a static method up on its class yields the plain
            # function, so wrap it again to restore a real static method.
            IBMModel4.model4_prob_t_a_given_s = staticmethod(
                original_prob_function
            )

View File

@@ -0,0 +1,279 @@
# -*- coding: utf-8 -*-
"""
Tests for common methods of IBM translation models
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel(unittest.TestCase):
    """Tests for the functionality shared by all IBM models (``IBMModel``)."""

    __TEST_SRC_SENTENCE = ["j'", 'aime', 'bien', 'jambon']
    __TEST_TRG_SENTENCE = ['i', 'love', 'ham']

    @staticmethod
    def _model_with_flat_alignment_table(translation_table):
        """Return an ``IBMModel`` on an empty corpus with injected tables.

        The model uses ``translation_table`` as given and an alignment
        table that yields 0.2 for every four-level lookup.
        """
        flat_alignment_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
        )
        model = IBMModel([])
        model.translation_table = translation_table
        model.alignment_table = flat_alignment_table
        return model

    @staticmethod
    def _three_word_translation_table():
        """Return translation probabilities for 'i', 'love' and 'ham'."""
        return {
            'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
            'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
            'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
        }

    @staticmethod
    def _green_eggs_alignment():
        """Return a fresh ``AlignmentInfo`` for 'green eggs' / 'des œufs verts'."""
        return AlignmentInfo(
            (0, 3, 2),
            (None, 'des', 'œufs', 'verts'),
            ('UNUSED', 'green', 'eggs'),
            [[], [], [2], [1]],
        )

    def test_vocabularies_are_initialized(self):
        sentence_pairs = [
            AlignedSent(['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
            AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
            AlignedSent([], ['sept']),
        ]

        model = IBMModel(sentence_pairs)

        self.assertEqual(len(model.src_vocab), 8)
        self.assertEqual(len(model.trg_vocab), 6)

    def test_vocabularies_are_initialized_even_with_empty_corpora(self):
        model = IBMModel([])

        self.assertEqual(len(model.src_vocab), 1)  # addition of NULL token
        self.assertEqual(len(model.trg_vocab), 0)

    def test_best_model2_alignment(self):
        # arrange: None and 'bien' end up with zero fertility
        pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
        )
        model = self._model_with_flat_alignment_table(
            self._three_word_translation_table()
        )

        # act
        best = model.best_model2_alignment(pair)

        # assert
        self.assertEqual(best.alignment[1:], (1, 2, 4))  # 0th element unused
        self.assertEqual(best.cepts, [[], [1], [2], [], [3]])

    def test_best_model2_alignment_does_not_change_pegged_alignment(self):
        # arrange
        pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
        )
        model = self._model_with_flat_alignment_table(
            self._three_word_translation_table()
        )

        # act: force 'love' to be pegged to 'jambon'
        best = model.best_model2_alignment(pair, 2, 4)

        # assert
        self.assertEqual(best.alignment[1:], (1, 4, 4))
        self.assertEqual(best.cepts, [[], [1], [], [], [2, 3]])

    def test_best_model2_alignment_handles_fertile_words(self):
        # arrange
        pair = AlignedSent(
            ['i', 'really', ',', 'really', 'love', 'ham'],
            TestIBMModel.__TEST_SRC_SENTENCE,
        )
        # 'bien' produces 2 target words: 'really' and another 'really'
        lexical_table = {
            'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
            'really': {"j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09},
            ',': {"j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7},
            'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
            'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
        }
        model = self._model_with_flat_alignment_table(lexical_table)

        # act
        best = model.best_model2_alignment(pair)

        # assert
        self.assertEqual(best.alignment[1:], (1, 3, 0, 3, 2, 4))
        self.assertEqual(best.cepts, [[3], [1], [5], [2, 4], [6]])

    def test_best_model2_alignment_handles_empty_src_sentence(self):
        # arrange
        pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
        model = IBMModel([])

        # act
        best = model.best_model2_alignment(pair)

        # assert: every target word is aligned to position 0
        self.assertEqual(best.alignment[1:], (0, 0, 0))
        self.assertEqual(best.cepts, [[1, 2, 3]])

    def test_best_model2_alignment_handles_empty_trg_sentence(self):
        # arrange
        pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
        model = IBMModel([])

        # act
        best = model.best_model2_alignment(pair)

        # assert: nothing to align, every cept stays empty
        self.assertEqual(best.alignment[1:], ())
        self.assertEqual(best.cepts, [[], [], [], [], []])

    def test_neighboring_finds_neighbor_alignments(self):
        # arrange
        model = IBMModel([])

        # act
        neighbors = model.neighboring(self._green_eggs_alignment())

        # assert
        found = {neighbor.alignment for neighbor in neighbors}
        expected = {
            # moves
            (0, 0, 2),
            (0, 1, 2),
            (0, 2, 2),
            (0, 3, 0),
            (0, 3, 1),
            (0, 3, 3),
            # swaps
            (0, 2, 3),
            # original alignment
            (0, 3, 2),
        }
        self.assertEqual(found, expected)

    def test_neighboring_sets_neighbor_alignment_info(self):
        # arrange
        model = IBMModel([])

        # act
        neighbors = model.neighboring(self._green_eggs_alignment())

        # assert: select a few particular alignments
        for candidate in neighbors:
            if candidate.alignment == (0, 2, 2):
                moved_alignment = candidate
            elif candidate.alignment == (0, 3, 2):
                swapped_alignment = candidate

        self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []])
        self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]])

    def test_neighboring_returns_neighbors_with_pegged_alignment(self):
        # arrange
        model = IBMModel([])

        # act: peg 'eggs' to align with 'œufs'
        neighbors = model.neighboring(self._green_eggs_alignment(), 2)

        # assert
        found = {neighbor.alignment for neighbor in neighbors}
        expected = {
            # moves
            (0, 0, 2),
            (0, 1, 2),
            (0, 2, 2),
            # no swaps
            # original alignment
            (0, 3, 2),
        }
        self.assertEqual(found, expected)

    def test_hillclimb(self):
        # arrange
        start = AlignmentInfo((0, 3, 2), None, None, None)

        # Neighbourhood graph of the mock: alignment -> neighbour alignments.
        # Alignments that are not listed have no neighbours.
        neighbor_graph = {
            (0, 3, 2): ((0, 2, 2), (0, 1, 1)),
            (0, 2, 2): ((0, 3, 3), (0, 4, 4)),
        }
        alignment_probs = {
            (0, 3, 2): 0.5,
            (0, 2, 2): 0.6,
            (0, 1, 1): 0.4,
            (0, 3, 3): 0.6,
            (0, 4, 4): 0.7,
        }

        def neighboring_mock(a, j):
            # Build fresh AlignmentInfo objects on every call.
            return set(
                AlignmentInfo(alignment, None, None, None)
                for alignment in neighbor_graph.get(a.alignment, ())
            )

        def prob_t_a_given_s_mock(a):
            return alignment_probs.get(a.alignment, 0.01)

        model = IBMModel([])
        model.neighboring = neighboring_mock
        model.prob_t_a_given_s = prob_t_a_given_s_mock

        # act
        best = model.hillclimb(start)

        # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
        self.assertEqual(best.alignment, (0, 4, 4))

    def test_sample(self):
        # arrange: every alignment is given the same probability
        pair = AlignedSent(
            TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
        )
        model = IBMModel([])
        model.prob_t_a_given_s = lambda x: 0.001

        # act
        samples, _ = model.sample(pair)

        # assert
        self.assertEqual(len(samples), 61)

View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
"""
Tests for NIST translation evaluation metric
"""
import io
import unittest
from nltk.data import find
from nltk.translate.nist_score import sentence_nist, corpus_nist
class TestNIST(unittest.TestCase):
    """Compares NLTK's NIST scores with the output of ``mteval-13a``."""

    def test_sentence_nist(self):
        # NOTE: despite its name, this test exercises ``corpus_nist``.  The
        # name is kept so the test id stays stable.
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Read the reference NIST scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the ngrams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are on the fourth line from the end of the file.
            # The first and the last field of that line are dropped.
            # NOTE(review): presumably those fields are the score name and
            # the system name -- confirm against the output file.
            score_fields = mteval_fin.readlines()[-4].split()[1:-1]
        mteval_nist_scores = [float(field) for field in score_fields]

        # Whitespace tokenize both files; split() also strips the newline.
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            hypotheses = [line.split() for line in hyp_fin]
        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            # corpus_nist is given a list of reference lists per segment;
            # here each segment has exactly one reference.
            references = [[line.split()] for line in ref_fin]

        # Compare n-gram orders 1..9, as far as reference scores exist.
        for n, mteval_nist in zip(range(1, 10), mteval_nist_scores):
            nltk_nist = corpus_nist(references, hypotheses, n)
            # The two scores must differ by less than 0.05.  A unittest
            # assertion is used instead of a bare ``assert`` so the check
            # still runs under ``python -O`` and reports the failing order.
            self.assertLess(
                abs(mteval_nist - nltk_nist),
                0.05,
                'NIST mismatch for n=%d: mteval=%s, nltk=%s'
                % (n, mteval_nist, nltk_nist),
            )

View File

@@ -0,0 +1,295 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Stack decoder
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Tests for stack decoder
"""
import unittest
from collections import defaultdict
from math import log
from nltk.translate import PhraseTable
from nltk.translate import StackDecoder
from nltk.translate.stack_decoder import _Hypothesis, _Stack
class TestStackDecoder(unittest.TestCase):
    """Tests for ``StackDecoder``, using a fake phrase table and language model."""

    def test_find_all_src_phrases(self):
        # arrange
        decoder = StackDecoder(TestStackDecoder.create_fake_phrase_table(), None)
        words = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')

        # act
        spans_by_start = decoder.find_all_src_phrases(words)

        # assert: each entry lists the end positions of the phrases that
        # start at that index
        self.assertEqual(spans_by_start[0], [2])  # 'my hovercraft'
        self.assertEqual(spans_by_start[1], [2])  # 'hovercraft'
        self.assertEqual(spans_by_start[2], [3])  # 'is'
        self.assertEqual(spans_by_start[3], [5, 6])  # 'full of', 'full of eels'
        self.assertFalse(spans_by_start[4])  # no entry starting with 'of'
        self.assertEqual(spans_by_start[5], [6])  # 'eels'

    def test_distortion_score(self):
        # arrange: the previous phrase covers source span (3, 5)
        decoder = StackDecoder(None, None)
        decoder.distortion_factor = 0.5
        previous = _Hypothesis()
        previous.src_phrase_span = (3, 5)

        # act
        actual = decoder.distortion_score(previous, (8, 10))

        # assert: log of the factor times the gap between the two spans
        expected = log(decoder.distortion_factor) * (8 - 5)
        self.assertEqual(actual, expected)

    def test_distortion_score_of_first_expansion(self):
        # arrange
        decoder = StackDecoder(None, None)
        decoder.distortion_factor = 0.5
        empty = _Hypothesis()

        # act
        actual = decoder.distortion_score(empty, (8, 10))

        # assert
        # expansion from empty hypothesis always has zero distortion cost
        self.assertEqual(actual, 0.0)

    def test_compute_future_costs(self):
        # arrange
        phrases = TestStackDecoder.create_fake_phrase_table()
        lm = TestStackDecoder.create_fake_language_model()
        decoder = StackDecoder(phrases, lm)
        words = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')

        # act
        scores = decoder.compute_future_scores(words)

        # assert: a span found in the phrase table scores as the log
        # probability of its first translation plus the language model value
        single_word_score = scores[1][2]
        single_word_expected = (
            phrases.translations_for(('hovercraft',))[0].log_prob
            + lm.probability(('hovercraft',))
        )
        self.assertEqual(single_word_score, single_word_expected)

        two_word_score = scores[0][2]
        two_word_expected = (
            phrases.translations_for(('my', 'hovercraft'))[0].log_prob
            + lm.probability(('my', 'hovercraft'))
        )
        self.assertEqual(two_word_score, two_word_expected)

    def test_compute_future_costs_for_phrases_not_in_phrase_table(self):
        # arrange
        decoder = StackDecoder(
            TestStackDecoder.create_fake_phrase_table(),
            TestStackDecoder.create_fake_language_model(),
        )
        words = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')

        # act
        scores = decoder.compute_future_scores(words)

        # assert: 'hovercraft is' is not in the phrase table, so its score
        # backs off to the sum of its two parts
        self.assertEqual(scores[1][3], scores[1][2] + scores[2][3])

    def test_future_score(self):
        # arrange: sentence with 8 words; words 2, 3, 4 already translated
        partial = _Hypothesis()
        partial.untranslated_spans = lambda _: [(0, 2), (5, 8)]  # mock
        score_table = defaultdict(lambda: defaultdict(float))
        score_table[0][2] = 0.4
        score_table[5][8] = 0.5
        decoder = StackDecoder(None, None)

        # act
        actual = decoder.future_score(partial, score_table, 8)

        # assert: sum over the untranslated spans
        self.assertEqual(actual, 0.4 + 0.5)

    def test_valid_phrases(self):
        # arrange
        partial = _Hypothesis()
        # mock untranslated_spans method
        partial.untranslated_spans = lambda _: [(0, 2), (3, 6)]
        phrase_ends_by_start = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]]

        # act
        spans = StackDecoder.valid_phrases(phrase_ends_by_start, partial)

        # assert
        self.assertEqual(spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)])

    @staticmethod
    def create_fake_phrase_table():
        """Return a small ``PhraseTable`` whose target phrases are empty strings."""
        entries = [
            (('hovercraft',), ('',), 0.8),
            (('my', 'hovercraft'), ('', ''), 0.7),
            (('my', 'cheese'), ('', ''), 0.7),
            (('is',), ('',), 0.8),
            (('is',), ('',), 0.5),
            (('full', 'of'), ('', ''), 0.01),
            (('full', 'of', 'eels'), ('', '', ''), 0.5),
            (('full', 'of', 'spam'), ('', ''), 0.5),
            (('eels',), ('',), 0.5),
            (('spam',), ('',), 0.5),
        ]
        table = PhraseTable()
        for src_phrase, trg_phrase, value in entries:
            table.add(src_phrase, trg_phrase, value)
        return table

    @staticmethod
    def create_fake_language_model():
        """Return an object whose ``probability(phrase)`` reads a fixed table.

        Phrases missing from the table get -999.0.
        """
        # nltk.model should be used here once it is implemented
        log_probs = defaultdict(lambda: -999.0)
        for word in ('my', 'hovercraft', 'is', 'full', 'of', 'eels'):
            log_probs[(word,)] = log(0.1)
        log_probs[('my', 'hovercraft')] = log(0.3)

        # Anonymous one-method class, instantiated right away.
        fake_model_class = type(
            '', (object,), {'probability': lambda _, phrase: log_probs[phrase]}
        )
        return fake_model_class()
class TestHypothesis(unittest.TestCase):
    """Tests for ``_Hypothesis``, mostly on a three-element hypothesis chain."""

    def setUp(self):
        # Chain under test: empty root -> 'hello world' -> 'and goodbye'.
        empty_root = _Hypothesis()
        first_expansion = _Hypothesis(
            raw_score=0.5,
            src_phrase_span=(3, 7),
            trg_phrase=('hello', 'world'),
            previous=empty_root,
        )
        self.hypothesis_chain = _Hypothesis(
            raw_score=0.4,
            src_phrase_span=(1, 2),
            trg_phrase=('and', 'goodbye'),
            previous=first_expansion,
        )

    def test_translation_so_far(self):
        # Target phrases are concatenated from the oldest to the newest.
        self.assertEqual(
            self.hypothesis_chain.translation_so_far(),
            ['hello', 'world', 'and', 'goodbye'],
        )

    def test_translation_so_far_for_empty_hypothesis(self):
        empty = _Hypothesis()

        self.assertEqual(empty.translation_so_far(), [])

    def test_total_translated_words(self):
        # Source spans (3, 7) and (1, 2) cover 4 + 1 positions.
        self.assertEqual(self.hypothesis_chain.total_translated_words(), 5)

    def test_translated_positions(self):
        positions = self.hypothesis_chain.translated_positions()

        # Only the set of positions matters, so compare in sorted order.
        self.assertEqual(sorted(positions), [1, 3, 4, 5, 6])

    def test_untranslated_spans(self):
        # For a sentence of length 10, the gaps around the two spans remain.
        self.assertEqual(
            self.hypothesis_chain.untranslated_spans(10),
            [(0, 1), (2, 3), (7, 10)],
        )

    def test_untranslated_spans_for_empty_hypothesis(self):
        empty = _Hypothesis()

        # Nothing has been translated, so the whole sentence is one span.
        self.assertEqual(empty.untranslated_spans(10), [(0, 10)])
class TestStack(unittest.TestCase):
    """Tests for _Stack: what push() keeps or discards, and what best() returns.

    The first constructor argument of ``_Stack`` is presumably the maximum
    size and the optional second one the beam threshold -- confirm against
    the _Stack implementation. ``_Hypothesis(x)`` passes ``x`` as its first
    positional argument, presumably the raw score.
    """

    def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self):
        # arrange
        stack = _Stack(3)
        poor_hypothesis = _Hypothesis(0.01)

        # act: four pushes into a stack created with 3
        stack.push(_Hypothesis(0.2))
        stack.push(poor_hypothesis)
        stack.push(_Hypothesis(0.1))
        stack.push(_Hypothesis(0.3))

        # assert: assertNotIn reports both operands on failure, unlike
        # assertFalse on a precomputed boolean
        self.assertNotIn(poor_hypothesis, stack)

    def test_push_removes_hypotheses_that_fall_below_beam_threshold(self):
        # arrange
        stack = _Stack(3, 0.5)
        poor_hypothesis = _Hypothesis(0.01)
        worse_hypothesis = _Hypothesis(0.009)

        # act
        stack.push(poor_hypothesis)
        stack.push(worse_hypothesis)
        stack.push(_Hypothesis(0.9))  # greatly superior hypothesis

        # assert
        self.assertNotIn(poor_hypothesis, stack)
        self.assertNotIn(worse_hypothesis, stack)

    def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self):
        # arrange
        stack = _Stack(3, 0.5)
        poor_hypothesis = _Hypothesis(0.01)

        # act
        stack.push(_Hypothesis(0.9))  # greatly superior hypothesis
        stack.push(poor_hypothesis)

        # assert
        self.assertNotIn(poor_hypothesis, stack)

    def test_best_returns_the_best_hypothesis(self):
        # arrange
        stack = _Stack(3)
        best_hypothesis = _Hypothesis(0.99)

        # act
        stack.push(_Hypothesis(0.0))
        stack.push(best_hypothesis)
        stack.push(_Hypothesis(0.5))

        # assert
        self.assertEqual(stack.best(), best_hypothesis)

    def test_best_returns_none_when_stack_is_empty(self):
        # arrange
        stack = _Stack(3)

        # assert: identity check against None rather than equality
        self.assertIsNone(stack.best())

View File

@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from unittest import TestCase
from functools import wraps
from nose.plugins.skip import SkipTest
from nltk.util import py26
def skip(reason):
    """
    Return a decorator that unconditionally skips the decorated test.

    The decorator accepts either a ``TestCase`` subclass or any other
    callable test item. Whatever it returns carries ``__unittest_skip__``
    set to True and ``__unittest_skip_why__`` set to ``reason``.
    """

    def decorator(test_item):
        if isinstance(test_item, type) and issubclass(test_item, TestCase):
            if py26():
                # Patch all test_ methods to raise the SkipTest exception.
                # This is necessary for Python 2.6 because its unittest
                # doesn't understand __unittest_skip__.
                test_names = [name for name in dir(test_item) if name.startswith('test_')]
                for name in test_names:
                    setattr(test_item, name, skip(reason)(getattr(test_item, name)))
            marked = test_item
        else:
            # Anything that is not a TestCase subclass is replaced by a
            # wrapper that raises SkipTest instead of running the original.
            original = test_item

            @wraps(original)
            def skip_wrapper(*args, **kwargs):
                raise SkipTest(reason)

            skip_wrapper.__name__ = original.__name__
            marked = skip_wrapper

        marked.__unittest_skip__ = True
        marked.__unittest_skip_why__ = reason
        return marked

    return decorator
def skipIf(condition, reason):
    """
    Return a decorator that skips the test only when ``condition`` is true.

    For a false ``condition`` the returned decorator hands the decorated
    object back unchanged; otherwise the result of ``skip(reason)`` is used.
    """
    if not condition:
        return lambda obj: obj
    return skip(reason)