Initial commit

This commit is contained in:
Senad Uka
2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions

View File

@@ -0,0 +1,271 @@
# -*- coding: utf-8 -*-
"""
Tests for BLEU translation evaluation metric
"""
import functools
import io
import unittest
from nltk.data import find
from nltk.translate.bleu_score import (
modified_precision,
brevity_penalty,
closest_ref_length,
)
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
class TestBLEU(unittest.TestCase):
def test_modified_precision(self):
"""
Examples from the original BLEU paper
http://www.aclweb.org/anthology/P02-1040.pdf
"""
# Example 1: the "the*" example.
# Reference sentences.
ref1 = 'the cat is on the mat'.split()
ref2 = 'there is a cat on the mat'.split()
# Hypothesis sentence(s).
hyp1 = 'the the the the the the the'.split()
references = [ref1, ref2]
# Testing modified unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
assert round(hyp1_unigram_precision, 4) == 0.2857
# With assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
# Testing modified bigram precision.
assert float(modified_precision(references, hyp1, n=2)) == 0.0
# Example 2: the "of the" example.
# Reference sentences
ref1 = str(
'It is a guide to action that ensures that the military '
'will forever heed Party commands'
).split()
ref2 = str(
'It is the guiding principle which guarantees the military '
'forces always being under the command of the Party'
).split()
ref3 = str(
'It is the practical guide for the army always to heed '
'the directions of the party'
).split()
# Hypothesis sentence(s).
hyp1 = 'of the'.split()
references = [ref1, ref2, ref3]
# Testing modified unigram precision.
assert float(modified_precision(references, hyp1, n=1)) == 1.0
# Testing modified bigram precision.
assert float(modified_precision(references, hyp1, n=2)) == 1.0
# Example 3: Proper MT outputs.
hyp1 = str(
'It is a guide to action which ensures that the military '
'always obeys the commands of the party'
).split()
hyp2 = str(
'It is to insure the troops forever hearing the activity '
'guidebook that party direct'
).split()
references = [ref1, ref2, ref3]
# Unigram precision.
hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
# Test unigram precision with assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
# Test unigram precision with rounding.
assert round(hyp1_unigram_precision, 4) == 0.9444
assert round(hyp2_unigram_precision, 4) == 0.5714
# Bigram precision
hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
# Test bigram precision with assertAlmostEqual at 4 place precision.
self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
# Test bigram precision with rounding.
assert round(hyp1_bigram_precision, 4) == 0.5882
assert round(hyp2_bigram_precision, 4) == 0.0769
def test_brevity_penalty(self):
# Test case from brevity_penalty_closest function in mteval-v13a.pl.
# Same test cases as in the doctest in nltk.translate.bleu_score.py
references = [['a'] * 11, ['a'] * 8]
hypothesis = ['a'] * 7
hyp_len = len(hypothesis)
closest_ref_len = closest_ref_length(references, hyp_len)
self.assertAlmostEqual(
brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
)
references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
hypothesis = ['a'] * 7
hyp_len = len(hypothesis)
closest_ref_len = closest_ref_length(references, hyp_len)
assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
def test_zero_matches(self):
# Test case where there's 0 matches
references = ['The candidate has no alignment to any of the references'.split()]
hypothesis = 'John loves Mary'.split()
# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = [1.0 / n] * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 0
def test_full_matches(self):
# Test case where there's 100% matches
references = ['John loves Mary'.split()]
hypothesis = 'John loves Mary'.split()
# Test BLEU to nth order of n-grams, where n is len(hypothesis).
for n in range(1, len(hypothesis)):
weights = [1.0 / n] * n # Uniform weights.
assert sentence_bleu(references, hypothesis, weights) == 1.0
def test_partial_matches_hypothesis_longer_than_reference(self):
references = ['John loves Mary'.split()]
hypothesis = 'John loves Mary who loves Mike'.split()
# Since no 4-grams matches were found the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
# Checks that the warning has been raised because len(reference) < 4.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
# @unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
def test_case_where_n_is_bigger_than_hypothesis_length(self):
# Test BLEU to nth order of n-grams, where n > len(hypothesis).
references = ['John loves Mary ?'.split()]
hypothesis = 'John loves Mary'.split()
n = len(hypothesis) + 1 #
weights = [1.0 / n] * n # Uniform weights.
# Since no n-grams matches were found the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(
sentence_bleu(references, hypothesis, weights), 0.0, places=4
)
# Checks that the warning has been raised because len(hypothesis) < 4.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
# Test case where n > len(hypothesis) but so is n > len(reference), and
# it's a special case where reference == hypothesis.
references = ['John loves Mary'.split()]
hypothesis = 'John loves Mary'.split()
# Since no 4-grams matches were found the result should be zero
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(
sentence_bleu(references, hypothesis, weights), 0.0, places=4
)
def test_empty_hypothesis(self):
# Test case where there's hypothesis is empty.
references = ['The candidate has no alignment to any of the references'.split()]
hypothesis = []
assert sentence_bleu(references, hypothesis) == 0
def test_empty_references(self):
# Test case where there's reference is empty.
references = [[]]
hypothesis = 'John loves Mary'.split()
assert sentence_bleu(references, hypothesis) == 0
def test_empty_references_and_hypothesis(self):
# Test case where both references and hypothesis is empty.
references = [[]]
hypothesis = []
assert sentence_bleu(references, hypothesis) == 0
def test_reference_or_hypothesis_shorter_than_fourgrams(self):
# Tese case where the length of reference or hypothesis
# is shorter than 4.
references = ['let it go'.split()]
hypothesis = 'let go it'.split()
# Checks that the value the hypothesis and reference returns is 0.0
# exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0
self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
# Checks that the warning has been raised.
try:
self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
except AttributeError:
pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
class TestBLEUvsMteval13a(unittest.TestCase):
def test_corpus_bleu(self):
ref_file = find('models/wmt15_eval/ref.ru')
hyp_file = find('models/wmt15_eval/google.ru')
mteval_output_file = find('models/wmt15_eval/mteval-13a.output')
# Reads the BLEU scores from the `mteval-13a.output` file.
# The order of the list corresponds to the order of the ngrams.
with open(mteval_output_file, 'r') as mteval_fin:
# The numbers are located in the last 2nd line of the file.
# The first and 2nd item in the list are the score and system names.
mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])
with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
# Whitespace tokenize the file.
# Note: split() automatically strip().
hypothesis = list(map(lambda x: x.split(), hyp_fin))
# Note that the corpus_bleu input is list of list of references.
references = list(map(lambda x: [x.split()], ref_fin))
# Without smoothing.
for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
nltk_bleu = corpus_bleu(
references, hypothesis, weights=(1.0 / i,) * i
)
# Check that the BLEU scores difference is less than 0.005 .
# Note: This is an approximate comparison; as much as
# +/- 0.01 BLEU might be "statistically significant",
# the actual translation quality might not be.
assert abs(mteval_bleu - nltk_bleu) < 0.005
# With the same smoothing method used in mteval-v13a.pl
chencherry = SmoothingFunction()
for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
nltk_bleu = corpus_bleu(
references,
hypothesis,
weights=(1.0 / i,) * i,
smoothing_function=chencherry.method3,
)
assert abs(mteval_bleu - nltk_bleu) < 0.005
class TestBLEUWithBadSentence(unittest.TestCase):
def test_corpus_bleu_with_bad_sentence(self):
hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
ref = str(
"Their tasks include changing a pump on the faulty stokehold ."
"Likewise , two species that are very similar in morphology "
"were distinguished using genetics ."
)
references = [[ref.split()]]
hypotheses = [hyp.split()]
try: # Check that the warning is raised since no. of 2-grams < 0.
with self.assertWarns(UserWarning):
# Verify that the BLEU output is undesired since no. of 2-grams < 0.
self.assertAlmostEqual(
corpus_bleu(references, hypotheses), 0.0, places=4
)
except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)

View File

@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
"""
Tests GDFA alignments
"""
import functools
import io
import unittest
from nltk.translate.gdfa import grow_diag_final_and
class TestGDFA(unittest.TestCase):
def test_from_eflomal_outputs(self):
"""
Testing GDFA with first 10 eflomal outputs from issue #1829
https://github.com/nltk/nltk/issues/1829
"""
# Input.
forwards = [
'0-0 1-2',
'0-0 1-1',
'0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14',
'0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10',
'0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31',
'0-0 1-1 0-2 2-3',
'0-0 2-2 4-4',
'0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20',
'3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14',
'1-0',
]
backwards = [
'0-0 1-2',
'0-0 1-1',
'0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13',
'0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8',
'0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31',
'0-0 1-1 2-3',
'0-0 1-1 2-3 4-4',
'0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18',
'0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10',
'1-0',
]
source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18]
target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16]
# Expected Output.
expected = [
[(0, 0), (1, 2)],
[(0, 0), (1, 1)],
[
(0, 0),
(2, 1),
(3, 2),
(4, 3),
(5, 4),
(6, 5),
(7, 6),
(8, 7),
(10, 10),
(11, 12),
],
[
(0, 0),
(1, 1),
(1, 2),
(2, 3),
(3, 4),
(4, 5),
(4, 6),
(5, 7),
(6, 8),
(7, 5),
(8, 7),
(8, 9),
(9, 8),
(9, 10),
],
[
(0, 0),
(1, 8),
(2, 9),
(3, 10),
(4, 11),
(5, 8),
(6, 9),
(6, 11),
(7, 10),
(8, 11),
(31, 31),
],
[(0, 0), (0, 2), (1, 1), (2, 3)],
[(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)],
[
(0, 0),
(1, 1),
(2, 3),
(3, 4),
(5, 5),
(7, 6),
(8, 7),
(9, 8),
(10, 9),
(11, 10),
(12, 11),
(13, 12),
(14, 13),
(15, 14),
(16, 16),
(17, 17),
(18, 18),
(19, 19),
],
[
(0, 0),
(1, 1),
(3, 0),
(3, 2),
(4, 1),
(5, 3),
(6, 2),
(6, 4),
(7, 5),
(8, 6),
(9, 7),
(9, 12),
(10, 8),
(10, 13),
(11, 9),
(12, 8),
(12, 14),
(13, 9),
(14, 8),
(15, 9),
(16, 10),
],
[(1, 0)],
[
(0, 0),
(1, 1),
(3, 2),
(4, 3),
(5, 4),
(6, 5),
(7, 6),
(9, 10),
(10, 12),
(11, 13),
(12, 14),
(13, 15),
],
]
# Iterate through all 10 examples and check for expected outputs.
for fw, bw, src_len, trg_len, expect in zip(
forwards, backwards, source_lens, target_lens, expected
):
self.assertListEqual(expect, grow_diag_final_and(src_len, trg_len, fw, bw))

View File

@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 1 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel1
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel1(unittest.TestCase):
def test_set_uniform_translation_probabilities(self):
# arrange
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model1 = IBMModel1(corpus, 0)
# act
model1.set_uniform_probabilities(corpus)
# assert
# expected_prob = 1.0 / (target vocab size + 1)
self.assertEqual(model1.translation_table['ham']['eier'], 1.0 / 3)
self.assertEqual(model1.translation_table['eggs'][None], 1.0 / 3)
def test_set_uniform_translation_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model1 = IBMModel1(corpus, 0)
# act
model1.set_uniform_probabilities(corpus)
# assert
# examine target words that are not in the training data domain
self.assertEqual(model1.translation_table['parrot']['eier'], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
['UNUSED'] + trg_sentence,
None,
)
translation_table = defaultdict(lambda: defaultdict(float))
translation_table['i']['ich'] = 0.98
translation_table['love']['gern'] = 0.98
translation_table['to'][None] = 0.98
translation_table['eat']['esse'] = 0.98
translation_table['smoked']['räucherschinken'] = 0.98
translation_table['ham']['räucherschinken'] = 0.98
model1 = IBMModel1(corpus, 0)
model1.translation_table = translation_table
# act
probability = model1.prob_t_a_given_s(alignment_info)
# assert
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
expected_probability = lexical_translation
self.assertEqual(round(probability, 4), round(expected_probability, 4))

View File

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 2 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel2
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel2(unittest.TestCase):
def test_set_uniform_alignment_probabilities(self):
# arrange
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model2 = IBMModel2(corpus, 0)
# act
model2.set_uniform_probabilities(corpus)
# assert
# expected_prob = 1.0 / (length of source sentence + 1)
self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4)
self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3)
def test_set_uniform_alignment_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model2 = IBMModel2(corpus, 0)
# act
model2.set_uniform_probabilities(corpus)
# assert
# examine i and j values that are not in the training data domain
self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB)
self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
['UNUSED'] + trg_sentence,
None,
)
translation_table = defaultdict(lambda: defaultdict(float))
translation_table['i']['ich'] = 0.98
translation_table['love']['gern'] = 0.98
translation_table['to'][None] = 0.98
translation_table['eat']['esse'] = 0.98
translation_table['smoked']['räucherschinken'] = 0.98
translation_table['ham']['räucherschinken'] = 0.98
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
)
alignment_table[0][3][5][6] = 0.97 # None -> to
alignment_table[1][1][5][6] = 0.97 # ich -> i
alignment_table[2][4][5][6] = 0.97 # esse -> eat
alignment_table[4][2][5][6] = 0.97 # gern -> love
alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked
alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham
model2 = IBMModel2(corpus, 0)
model2.translation_table = translation_table
model2.alignment_table = alignment_table
# act
probability = model2.prob_t_a_given_s(alignment_info)
# assert
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
expected_probability = lexical_translation * alignment
self.assertEqual(round(probability, 4), round(expected_probability, 4))

View File

@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 3 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel3
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel3(unittest.TestCase):
def test_set_uniform_distortion_probabilities(self):
# arrange
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model3 = IBMModel3(corpus, 0)
# act
model3.set_uniform_probabilities(corpus)
# assert
# expected_prob = 1.0 / length of target sentence
self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2)
self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4)
def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
# arrange
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model3 = IBMModel3(corpus, 0)
# act
model3.set_uniform_probabilities(corpus)
# assert
# examine i and j values that are not in the training data domain
self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB)
self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB)
self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
['UNUSED'] + trg_sentence,
[[3], [1], [4], [], [2], [5, 6]],
)
distortion_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
)
distortion_table[1][1][5][6] = 0.97 # i -> ich
distortion_table[2][4][5][6] = 0.97 # love -> gern
distortion_table[3][0][5][6] = 0.97 # to -> NULL
distortion_table[4][2][5][6] = 0.97 # eat -> esse
distortion_table[5][5][5][6] = 0.97 # smoked -> räucherschinken
distortion_table[6][5][5][6] = 0.97 # ham -> räucherschinken
translation_table = defaultdict(lambda: defaultdict(float))
translation_table['i']['ich'] = 0.98
translation_table['love']['gern'] = 0.98
translation_table['to'][None] = 0.98
translation_table['eat']['esse'] = 0.98
translation_table['smoked']['räucherschinken'] = 0.98
translation_table['ham']['räucherschinken'] = 0.98
fertility_table = defaultdict(lambda: defaultdict(float))
fertility_table[1]['ich'] = 0.99
fertility_table[1]['esse'] = 0.99
fertility_table[0]['ja'] = 0.99
fertility_table[1]['gern'] = 0.99
fertility_table[2]['räucherschinken'] = 0.999
fertility_table[1][None] = 0.99
probabilities = {
'p1': 0.167,
'translation_table': translation_table,
'distortion_table': distortion_table,
'fertility_table': fertility_table,
'alignment_table': None,
}
model3 = IBMModel3(corpus, 0, probabilities)
# act
probability = model3.prob_t_a_given_s(alignment_info)
# assert
null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97
expected_probability = (
null_generation * fertility * lexical_translation * distortion
)
self.assertEqual(round(probability, 4), round(expected_probability, 4))

View File

@@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 4 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel4
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel4(unittest.TestCase):
def test_set_uniform_distortion_probabilities_of_max_displacements(self):
# arrange
src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
# act
model4.set_uniform_probabilities(corpus)
# assert
# number of displacement values =
# 2 *(number of words in longest target sentence - 1)
expected_prob = 1.0 / (2 * (4 - 1))
# examine the boundary values for (displacement, src_class, trg_class)
self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob)
self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob)
self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob)
self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob)
def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
# arrange
src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model4 = IBMModel4(corpus, 0, src_classes, trg_classes)
# act
model4.set_uniform_probabilities(corpus)
# assert
# examine displacement values that are not in the training data domain
self.assertEqual(model4.head_distortion_table[4][0][0], IBMModel.MIN_PROB)
self.assertEqual(model4.head_distortion_table[100][1][2], IBMModel.MIN_PROB)
self.assertEqual(model4.non_head_distortion_table[4][0], IBMModel.MIN_PROB)
self.assertEqual(model4.non_head_distortion_table[100][2], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4}
trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4}
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
['UNUSED'] + trg_sentence,
[[3], [1], [4], [], [2], [5, 6]],
)
head_distortion_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
head_distortion_table[1][None][3] = 0.97 # None, i
head_distortion_table[3][2][4] = 0.97 # ich, eat
head_distortion_table[-2][3][4] = 0.97 # esse, love
head_distortion_table[3][4][1] = 0.97 # gern, smoked
non_head_distortion_table = defaultdict(lambda: defaultdict(float))
non_head_distortion_table[1][0] = 0.96 # ham
translation_table = defaultdict(lambda: defaultdict(float))
translation_table['i']['ich'] = 0.98
translation_table['love']['gern'] = 0.98
translation_table['to'][None] = 0.98
translation_table['eat']['esse'] = 0.98
translation_table['smoked']['räucherschinken'] = 0.98
translation_table['ham']['räucherschinken'] = 0.98
fertility_table = defaultdict(lambda: defaultdict(float))
fertility_table[1]['ich'] = 0.99
fertility_table[1]['esse'] = 0.99
fertility_table[0]['ja'] = 0.99
fertility_table[1]['gern'] = 0.99
fertility_table[2]['räucherschinken'] = 0.999
fertility_table[1][None] = 0.99
probabilities = {
'p1': 0.167,
'translation_table': translation_table,
'head_distortion_table': head_distortion_table,
'non_head_distortion_table': non_head_distortion_table,
'fertility_table': fertility_table,
'alignment_table': None,
}
model4 = IBMModel4(corpus, 0, src_classes, trg_classes, probabilities)
# act
probability = model4.prob_t_a_given_s(alignment_info)
# assert
null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
expected_probability = (
null_generation * fertility * lexical_translation * distortion
)
self.assertEqual(round(probability, 4), round(expected_probability, 4))

View File

@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
"""
Tests for IBM Model 5 training methods
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate import IBMModel4
from nltk.translate import IBMModel5
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel5(unittest.TestCase):
def test_set_uniform_vacancy_probabilities_of_max_displacements(self):
# arrange
src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
# act
model5.set_uniform_probabilities(corpus)
# assert
# number of vacancy difference values =
# 2 * number of words in longest target sentence
expected_prob = 1.0 / (2 * 4)
# examine the boundary values for (dv, max_v, trg_class)
self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob)
self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob)
self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)
def test_set_uniform_vacancy_probabilities_of_non_domain_values(self):
# arrange
src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
corpus = [
AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
]
model5 = IBMModel5(corpus, 0, src_classes, trg_classes)
# act
model5.set_uniform_probabilities(corpus)
# assert
# examine dv and max_v values that are not in the training data domain
self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB)
self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
def test_prob_t_a_given_s(self):
# arrange
src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4}
trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4}
corpus = [AlignedSent(trg_sentence, src_sentence)]
alignment_info = AlignmentInfo(
(0, 1, 4, 0, 2, 5, 5),
[None] + src_sentence,
['UNUSED'] + trg_sentence,
[[3], [1], [4], [], [2], [5, 6]],
)
head_vacancy_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i
head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat
head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love
head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked
non_head_vacancy_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(float))
)
non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham
translation_table = defaultdict(lambda: defaultdict(float))
translation_table['i']['ich'] = 0.98
translation_table['love']['gern'] = 0.98
translation_table['to'][None] = 0.98
translation_table['eat']['esse'] = 0.98
translation_table['smoked']['räucherschinken'] = 0.98
translation_table['ham']['räucherschinken'] = 0.98
fertility_table = defaultdict(lambda: defaultdict(float))
fertility_table[1]['ich'] = 0.99
fertility_table[1]['esse'] = 0.99
fertility_table[0]['ja'] = 0.99
fertility_table[1]['gern'] = 0.99
fertility_table[2]['räucherschinken'] = 0.999
fertility_table[1][None] = 0.99
probabilities = {
'p1': 0.167,
'translation_table': translation_table,
'fertility_table': fertility_table,
'head_vacancy_table': head_vacancy_table,
'non_head_vacancy_table': non_head_vacancy_table,
'head_distortion_table': None,
'non_head_distortion_table': None,
'alignment_table': None,
}
model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities)
# act
probability = model5.prob_t_a_given_s(alignment_info)
# assert
null_generation = 5 * pow(0.167, 1) * pow(0.833, 4)
fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999
lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96
expected_probability = (
null_generation * fertility * lexical_translation * vacancy
)
self.assertEqual(round(probability, 4), round(expected_probability, 4))
def test_prune(self):
# arrange
alignment_infos = [
AlignmentInfo((1, 1), None, None, None),
AlignmentInfo((1, 2), None, None, None),
AlignmentInfo((2, 1), None, None, None),
AlignmentInfo((2, 2), None, None, None),
AlignmentInfo((0, 0), None, None, None),
]
min_factor = IBMModel5.MIN_SCORE_FACTOR
best_score = 0.9
scores = {
(1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold
(1, 2): best_score,
(2, 1): min_factor * best_score, # at threshold
(2, 2): min_factor * best_score * 0.5, # low score
(0, 0): min(min_factor * 1.1, 1) * 1.2, # above threshold
}
corpus = [AlignedSent(['a'], ['b'])]
original_prob_function = IBMModel4.model4_prob_t_a_given_s
# mock static method
IBMModel4.model4_prob_t_a_given_s = staticmethod(
lambda a, model: scores[a.alignment]
)
model5 = IBMModel5(corpus, 0, None, None)
# act
pruned_alignments = model5.prune(alignment_infos)
# assert
self.assertEqual(len(pruned_alignments), 3)
# restore static method
IBMModel4.model4_prob_t_a_given_s = original_prob_function

View File

@@ -0,0 +1,279 @@
# -*- coding: utf-8 -*-
"""
Tests for common methods of IBM translation models
"""
import unittest
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import IBMModel
from nltk.translate.ibm_model import AlignmentInfo
class TestIBMModel(unittest.TestCase):
__TEST_SRC_SENTENCE = ["j'", 'aime', 'bien', 'jambon']
__TEST_TRG_SENTENCE = ['i', 'love', 'ham']
def test_vocabularies_are_initialized(self):
parallel_corpora = [
AlignedSent(['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
AlignedSent([], ['sept']),
]
ibm_model = IBMModel(parallel_corpora)
self.assertEqual(len(ibm_model.src_vocab), 8)
self.assertEqual(len(ibm_model.trg_vocab), 6)
def test_vocabularies_are_initialized_even_with_empty_corpora(self):
parallel_corpora = []
ibm_model = IBMModel(parallel_corpora)
self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token
self.assertEqual(len(ibm_model.trg_vocab), 0)
def test_best_model2_alignment(self):
# arrange
sentence_pair = AlignedSent(
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
)
# None and 'bien' have zero fertility
translation_table = {
'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
}
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
)
ibm_model = IBMModel([])
ibm_model.translation_table = translation_table
ibm_model.alignment_table = alignment_table
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused
self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])
def test_best_model2_alignment_does_not_change_pegged_alignment(self):
# arrange
sentence_pair = AlignedSent(
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
)
translation_table = {
'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
}
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
)
ibm_model = IBMModel([])
ibm_model.translation_table = translation_table
ibm_model.alignment_table = alignment_table
# act: force 'love' to be pegged to 'jambon'
a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4)
# assert
self.assertEqual(a_info.alignment[1:], (1, 4, 4))
self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]])
def test_best_model2_alignment_handles_fertile_words(self):
# arrange
sentence_pair = AlignedSent(
['i', 'really', ',', 'really', 'love', 'ham'],
TestIBMModel.__TEST_SRC_SENTENCE,
)
# 'bien' produces 2 target words: 'really' and another 'really'
translation_table = {
'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
'really': {"j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09},
',': {"j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7},
'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
}
alignment_table = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
)
ibm_model = IBMModel([])
ibm_model.translation_table = translation_table
ibm_model.alignment_table = alignment_table
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])
def test_best_model2_alignment_handles_empty_src_sentence(self):
# arrange
sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
ibm_model = IBMModel([])
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], (0, 0, 0))
self.assertEqual(a_info.cepts, [[1, 2, 3]])
def test_best_model2_alignment_handles_empty_trg_sentence(self):
# arrange
sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
ibm_model = IBMModel([])
# act
a_info = ibm_model.best_model2_alignment(sentence_pair)
# assert
self.assertEqual(a_info.alignment[1:], ())
self.assertEqual(a_info.cepts, [[], [], [], [], []])
def test_neighboring_finds_neighbor_alignments(self):
# arrange
a_info = AlignmentInfo(
(0, 3, 2),
(None, 'des', 'œufs', 'verts'),
('UNUSED', 'green', 'eggs'),
[[], [], [2], [1]],
)
ibm_model = IBMModel([])
# act
neighbors = ibm_model.neighboring(a_info)
# assert
neighbor_alignments = set()
for neighbor in neighbors:
neighbor_alignments.add(neighbor.alignment)
expected_alignments = set(
[
# moves
(0, 0, 2),
(0, 1, 2),
(0, 2, 2),
(0, 3, 0),
(0, 3, 1),
(0, 3, 3),
# swaps
(0, 2, 3),
# original alignment
(0, 3, 2),
]
)
self.assertEqual(neighbor_alignments, expected_alignments)
def test_neighboring_sets_neighbor_alignment_info(self):
# arrange
a_info = AlignmentInfo(
(0, 3, 2),
(None, 'des', 'œufs', 'verts'),
('UNUSED', 'green', 'eggs'),
[[], [], [2], [1]],
)
ibm_model = IBMModel([])
# act
neighbors = ibm_model.neighboring(a_info)
# assert: select a few particular alignments
for neighbor in neighbors:
if neighbor.alignment == (0, 2, 2):
moved_alignment = neighbor
elif neighbor.alignment == (0, 3, 2):
swapped_alignment = neighbor
self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []])
self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]])
def test_neighboring_returns_neighbors_with_pegged_alignment(self):
# arrange
a_info = AlignmentInfo(
(0, 3, 2),
(None, 'des', 'œufs', 'verts'),
('UNUSED', 'green', 'eggs'),
[[], [], [2], [1]],
)
ibm_model = IBMModel([])
# act: peg 'eggs' to align with 'œufs'
neighbors = ibm_model.neighboring(a_info, 2)
# assert
neighbor_alignments = set()
for neighbor in neighbors:
neighbor_alignments.add(neighbor.alignment)
expected_alignments = set(
[
# moves
(0, 0, 2),
(0, 1, 2),
(0, 2, 2),
# no swaps
# original alignment
(0, 3, 2),
]
)
self.assertEqual(neighbor_alignments, expected_alignments)
def test_hillclimb(self):
# arrange
initial_alignment = AlignmentInfo((0, 3, 2), None, None, None)
def neighboring_mock(a, j):
if a.alignment == (0, 3, 2):
return set(
[
AlignmentInfo((0, 2, 2), None, None, None),
AlignmentInfo((0, 1, 1), None, None, None),
]
)
elif a.alignment == (0, 2, 2):
return set(
[
AlignmentInfo((0, 3, 3), None, None, None),
AlignmentInfo((0, 4, 4), None, None, None),
]
)
return set()
def prob_t_a_given_s_mock(a):
prob_values = {
(0, 3, 2): 0.5,
(0, 2, 2): 0.6,
(0, 1, 1): 0.4,
(0, 3, 3): 0.6,
(0, 4, 4): 0.7,
}
return prob_values.get(a.alignment, 0.01)
ibm_model = IBMModel([])
ibm_model.neighboring = neighboring_mock
ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock
# act
best_alignment = ibm_model.hillclimb(initial_alignment)
# assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
self.assertEqual(best_alignment.alignment, (0, 4, 4))
def test_sample(self):
# arrange
sentence_pair = AlignedSent(
TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
)
ibm_model = IBMModel([])
ibm_model.prob_t_a_given_s = lambda x: 0.001
# act
samples, best_alignment = ibm_model.sample(sentence_pair)
# assert
self.assertEqual(len(samples), 61)

View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
"""
Tests for NIST translation evaluation metric
"""
import io
import unittest
from nltk.data import find
from nltk.translate.nist_score import sentence_nist, corpus_nist
class TestNIST(unittest.TestCase):
def test_sentence_nist(self):
ref_file = find('models/wmt15_eval/ref.ru')
hyp_file = find('models/wmt15_eval/google.ru')
mteval_output_file = find('models/wmt15_eval/mteval-13a.output')
# Reads the NIST scores from the `mteval-13a.output` file.
# The order of the list corresponds to the order of the ngrams.
with open(mteval_output_file, 'r') as mteval_fin:
# The numbers are located in the last 4th line of the file.
# The first and 2nd item in the list are the score and system names.
mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])
with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
# Whitespace tokenize the file.
# Note: split() automatically strip().
hypotheses = list(map(lambda x: x.split(), hyp_fin))
# Note that the corpus_bleu input is list of list of references.
references = list(map(lambda x: [x.split()], ref_fin))
# Without smoothing.
for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
nltk_nist = corpus_nist(references, hypotheses, i)
# Check that the NIST scores difference is less than 0.5
assert abs(mteval_nist - nltk_nist) < 0.05

View File

@@ -0,0 +1,295 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Stack decoder
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Tests for stack decoder
"""
import unittest
from collections import defaultdict
from math import log
from nltk.translate import PhraseTable
from nltk.translate import StackDecoder
from nltk.translate.stack_decoder import _Hypothesis, _Stack
class TestStackDecoder(unittest.TestCase):
def test_find_all_src_phrases(self):
# arrange
phrase_table = TestStackDecoder.create_fake_phrase_table()
stack_decoder = StackDecoder(phrase_table, None)
sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
# act
src_phrase_spans = stack_decoder.find_all_src_phrases(sentence)
# assert
self.assertEqual(src_phrase_spans[0], [2]) # 'my hovercraft'
self.assertEqual(src_phrase_spans[1], [2]) # 'hovercraft'
self.assertEqual(src_phrase_spans[2], [3]) # 'is'
self.assertEqual(src_phrase_spans[3], [5, 6]) # 'full of', 'full of eels'
self.assertFalse(src_phrase_spans[4]) # no entry starting with 'of'
self.assertEqual(src_phrase_spans[5], [6]) # 'eels'
def test_distortion_score(self):
# arrange
stack_decoder = StackDecoder(None, None)
stack_decoder.distortion_factor = 0.5
hypothesis = _Hypothesis()
hypothesis.src_phrase_span = (3, 5)
# act
score = stack_decoder.distortion_score(hypothesis, (8, 10))
# assert
expected_score = log(stack_decoder.distortion_factor) * (8 - 5)
self.assertEqual(score, expected_score)
def test_distortion_score_of_first_expansion(self):
# arrange
stack_decoder = StackDecoder(None, None)
stack_decoder.distortion_factor = 0.5
hypothesis = _Hypothesis()
# act
score = stack_decoder.distortion_score(hypothesis, (8, 10))
# assert
# expansion from empty hypothesis always has zero distortion cost
self.assertEqual(score, 0.0)
def test_compute_future_costs(self):
# arrange
phrase_table = TestStackDecoder.create_fake_phrase_table()
language_model = TestStackDecoder.create_fake_language_model()
stack_decoder = StackDecoder(phrase_table, language_model)
sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
# act
future_scores = stack_decoder.compute_future_scores(sentence)
# assert
self.assertEqual(
future_scores[1][2],
(
phrase_table.translations_for(('hovercraft',))[0].log_prob
+ language_model.probability(('hovercraft',))
),
)
self.assertEqual(
future_scores[0][2],
(
phrase_table.translations_for(('my', 'hovercraft'))[0].log_prob
+ language_model.probability(('my', 'hovercraft'))
),
)
def test_compute_future_costs_for_phrases_not_in_phrase_table(self):
# arrange
phrase_table = TestStackDecoder.create_fake_phrase_table()
language_model = TestStackDecoder.create_fake_language_model()
stack_decoder = StackDecoder(phrase_table, language_model)
sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels')
# act
future_scores = stack_decoder.compute_future_scores(sentence)
# assert
self.assertEqual(
future_scores[1][3], # 'hovercraft is' is not in phrase table
future_scores[1][2] + future_scores[2][3],
) # backoff
def test_future_score(self):
# arrange: sentence with 8 words; words 2, 3, 4 already translated
hypothesis = _Hypothesis()
hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)] # mock
future_score_table = defaultdict(lambda: defaultdict(float))
future_score_table[0][2] = 0.4
future_score_table[5][8] = 0.5
stack_decoder = StackDecoder(None, None)
# act
future_score = stack_decoder.future_score(hypothesis, future_score_table, 8)
# assert
self.assertEqual(future_score, 0.4 + 0.5)
def test_valid_phrases(self):
# arrange
hypothesis = _Hypothesis()
# mock untranslated_spans method
hypothesis.untranslated_spans = lambda _: [(0, 2), (3, 6)]
all_phrases_from = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]]
# act
phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis)
# assert
self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)])
@staticmethod
def create_fake_phrase_table():
phrase_table = PhraseTable()
phrase_table.add(('hovercraft',), ('',), 0.8)
phrase_table.add(('my', 'hovercraft'), ('', ''), 0.7)
phrase_table.add(('my', 'cheese'), ('', ''), 0.7)
phrase_table.add(('is',), ('',), 0.8)
phrase_table.add(('is',), ('',), 0.5)
phrase_table.add(('full', 'of'), ('', ''), 0.01)
phrase_table.add(('full', 'of', 'eels'), ('', '', ''), 0.5)
phrase_table.add(('full', 'of', 'spam'), ('', ''), 0.5)
phrase_table.add(('eels',), ('',), 0.5)
phrase_table.add(('spam',), ('',), 0.5)
return phrase_table
@staticmethod
def create_fake_language_model():
# nltk.model should be used here once it is implemented
language_prob = defaultdict(lambda: -999.0)
language_prob[('my',)] = log(0.1)
language_prob[('hovercraft',)] = log(0.1)
language_prob[('is',)] = log(0.1)
language_prob[('full',)] = log(0.1)
language_prob[('of',)] = log(0.1)
language_prob[('eels',)] = log(0.1)
language_prob[('my', 'hovercraft')] = log(0.3)
language_model = type(
'', (object,), {'probability': lambda _, phrase: language_prob[phrase]}
)()
return language_model
class TestHypothesis(unittest.TestCase):
def setUp(self):
root = _Hypothesis()
child = _Hypothesis(
raw_score=0.5,
src_phrase_span=(3, 7),
trg_phrase=('hello', 'world'),
previous=root,
)
grandchild = _Hypothesis(
raw_score=0.4,
src_phrase_span=(1, 2),
trg_phrase=('and', 'goodbye'),
previous=child,
)
self.hypothesis_chain = grandchild
def test_translation_so_far(self):
# act
translation = self.hypothesis_chain.translation_so_far()
# assert
self.assertEqual(translation, ['hello', 'world', 'and', 'goodbye'])
def test_translation_so_far_for_empty_hypothesis(self):
# arrange
hypothesis = _Hypothesis()
# act
translation = hypothesis.translation_so_far()
# assert
self.assertEqual(translation, [])
def test_total_translated_words(self):
# act
total_translated_words = self.hypothesis_chain.total_translated_words()
# assert
self.assertEqual(total_translated_words, 5)
def test_translated_positions(self):
# act
translated_positions = self.hypothesis_chain.translated_positions()
# assert
translated_positions.sort()
self.assertEqual(translated_positions, [1, 3, 4, 5, 6])
def test_untranslated_spans(self):
# act
untranslated_spans = self.hypothesis_chain.untranslated_spans(10)
# assert
self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)])
def test_untranslated_spans_for_empty_hypothesis(self):
# arrange
hypothesis = _Hypothesis()
# act
untranslated_spans = hypothesis.untranslated_spans(10)
# assert
self.assertEqual(untranslated_spans, [(0, 10)])
class TestStack(unittest.TestCase):
def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self):
# arrange
stack = _Stack(3)
poor_hypothesis = _Hypothesis(0.01)
# act
stack.push(_Hypothesis(0.2))
stack.push(poor_hypothesis)
stack.push(_Hypothesis(0.1))
stack.push(_Hypothesis(0.3))
# assert
self.assertFalse(poor_hypothesis in stack)
def test_push_removes_hypotheses_that_fall_below_beam_threshold(self):
# arrange
stack = _Stack(3, 0.5)
poor_hypothesis = _Hypothesis(0.01)
worse_hypothesis = _Hypothesis(0.009)
# act
stack.push(poor_hypothesis)
stack.push(worse_hypothesis)
stack.push(_Hypothesis(0.9)) # greatly superior hypothesis
# assert
self.assertFalse(poor_hypothesis in stack)
self.assertFalse(worse_hypothesis in stack)
def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self):
# arrange
stack = _Stack(3, 0.5)
poor_hypothesis = _Hypothesis(0.01)
# act
stack.push(_Hypothesis(0.9)) # greatly superior hypothesis
stack.push(poor_hypothesis)
# assert
self.assertFalse(poor_hypothesis in stack)
def test_best_returns_the_best_hypothesis(self):
# arrange
stack = _Stack(3)
best_hypothesis = _Hypothesis(0.99)
# act
stack.push(_Hypothesis(0.0))
stack.push(best_hypothesis)
stack.push(_Hypothesis(0.5))
# assert
self.assertEqual(stack.best(), best_hypothesis)
def test_best_returns_none_when_stack_is_empty(self):
# arrange
stack = _Stack(3)
# assert
self.assertEqual(stack.best(), None)