Initial commit

This commit is contained in:
Senad Uka
2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions

View File

@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
import six
from nltk import FreqDist
from nltk.lm import NgramCounter
from nltk.util import everygrams
class NgramCounterTests(unittest.TestCase):
    """Lookup tests for NgramCounter against counters trained once per class.

    The bigram and trigram counters are built in ``setUpClass`` and shared by
    every test, so tests must not alter them. The one test that checks a
    lookup side effect builds its own private counter instead.
    """

    # Two toy "sentences" of single-character tokens used for all counters.
    _SENTENCES = (list("abcd"), list("egdbe"))

    @classmethod
    def _make_counter(cls, max_len):
        """Return a fresh NgramCounter trained on everygrams up to ``max_len``."""
        return NgramCounter(
            (everygrams(sent, max_len=max_len) for sent in cls._SENTENCES)
        )

    @classmethod
    def setUpClass(cls):
        cls.trigram_counter = cls._make_counter(3)
        cls.bigram_counter = cls._make_counter(2)

    def test_N(self):
        self.assertEqual(self.bigram_counter.N(), 16)
        self.assertEqual(self.trigram_counter.N(), 21)

    def test_counter_len_changes_with_lookup(self):
        # Looking up an unseen order grows the counter, so work on a private
        # instance: mutating the shared fixture would make the outcome of this
        # (and other) tests depend on execution order.
        counter = self._make_counter(2)
        self.assertEqual(len(counter), 2)
        _ = counter[50]
        self.assertEqual(len(counter), 3)

    def test_ngram_order_access_unigrams(self):
        self.assertEqual(self.bigram_counter[1], self.bigram_counter.unigrams)

    def test_ngram_conditional_freqdist(self):
        expected_trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)]

        bigrams = self.trigram_counter[2]
        trigrams = self.trigram_counter[3]

        # Order of conditions is irrelevant, only the set of contexts matters.
        six.assertCountEqual(self, expected_bigram_contexts, bigrams.conditions())
        six.assertCountEqual(self, expected_trigram_contexts, trigrams.conditions())

    def test_bigram_counts_seen_ngrams(self):
        b_given_a_count = 1
        c_given_b_count = 1

        self.assertEqual(b_given_a_count, self.bigram_counter[["a"]]["b"])
        self.assertEqual(c_given_b_count, self.bigram_counter[["b"]]["c"])

    def test_bigram_counts_unseen_ngrams(self):
        z_given_b_count = 0

        self.assertEqual(z_given_b_count, self.bigram_counter[["b"]]["z"])

    def test_unigram_counts_seen_words(self):
        expected_count_b = 2

        self.assertEqual(expected_count_b, self.bigram_counter["b"])

    def test_unigram_counts_completely_unseen_words(self):
        unseen_count = 0

        self.assertEqual(unseen_count, self.bigram_counter["z"])
class NgramCounterTrainingTests(unittest.TestCase):
    """Tests for how NgramCounter is populated from different training inputs."""

    def setUp(self):
        self.counter = NgramCounter()

    def _assert_nothing_counted(self, counter):
        """Check that no bigram order exists and the unigram counts are empty."""
        self.assertNotIn(2, counter)
        self.assertEqual(counter[1], FreqDist())

    def test_empty_string(self):
        self._assert_nothing_counted(NgramCounter(""))

    def test_empty_list(self):
        self._assert_nothing_counted(NgramCounter([]))

    def test_None(self):
        self._assert_nothing_counted(NgramCounter(None))

    def test_train_on_unigrams(self):
        words = list("abcd")
        unigram_sent = [(word,) for word in words]
        counter = NgramCounter([unigram_sent])

        # Only unigrams were supplied, so higher orders must stay empty.
        for order in (3, 2):
            self.assertFalse(bool(counter[order]))
        six.assertCountEqual(self, words, counter[1].keys())

    def test_train_on_illegal_sentences(self):
        # Sentences must consist of ngram tuples: bare strings and lists are
        # both expected to be rejected with a TypeError.
        str_sent = ["Check", "this", "out", "!"]
        list_sent = [["Check", "this"], ["this", "out"], ["out", "!"]]

        with self.assertRaises(TypeError):
            NgramCounter([str_sent])

        with self.assertRaises(TypeError):
            NgramCounter([list_sent])

    def test_train_on_bigrams(self):
        bigram_sent = [("a", "b"), ("c", "d")]
        counter = NgramCounter([bigram_sent])

        self.assertFalse(bool(counter[3]))

    def test_train_on_mix(self):
        mixed_sent = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)]
        counter = NgramCounter([mixed_sent])

        # Expected keys per ngram order: words for order 1, contexts above.
        expected_keys_by_order = (
            (1, ["h"]),
            (2, [("a",), ("c",)]),
            (3, [("e", "f")]),
        )
        for order, expected_keys in expected_keys_by_order:
            six.assertCountEqual(self, expected_keys, counter[order].keys())

View File

@@ -0,0 +1,446 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import division
import math
import sys
import unittest
from six import add_metaclass
from nltk.lm import (
Vocabulary,
MLE,
Lidstone,
Laplace,
WittenBellInterpolated,
KneserNeyInterpolated,
)
from nltk.lm.preprocessing import padded_everygrams
def _prepare_test_data(ngram_order):
    """Return the ``(vocabulary, training_text)`` pair shared by the model tests.

    :param ngram_order: highest ngram order passed to ``padded_everygrams``.
    :return: a ``Vocabulary`` with cutoff 1 and a list holding, per sentence,
        the list of padded everygrams.
    """
    vocabulary = Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
    sentences = (list("abcd"), list("egadbe"))
    training_text = []
    for sent in sentences:
        training_text.append(list(padded_everygrams(ngram_order, sent)))
    return vocabulary, training_text
class ParametrizeTestsMeta(type):
    """Metaclass for generating parametrized tests.

    Every class created through this metaclass receives:

    * one ``test_sumto1_<i>`` method per context in a fixed list, asserting
      that the scores of all vocabulary words in that context sum to 1;
    * one ``test_score_<i>`` method per ``(word, context, expected_score)``
      triple found in the class's optional ``score_tests`` attribute.

    Generated tests read the model under test from ``self.model``.
    """

    def __new__(cls, name, bases, dct):
        contexts = (
            ("a",),
            ("c",),
            (u"<s>",),
            ("b",),
            (u"<UNK>",),
            ("d",),
            ("e",),
            ("r",),
            ("w",),
        )
        for i, c in enumerate(contexts):
            test_name = "test_sumto1_{0}".format(i)
            sum_test = cls.add_sum_to_1_test(c)
            # Give the generated function its real name so it is identifiable
            # in tracebacks and introspection.
            sum_test.__name__ = test_name
            dct[test_name] = sum_test

        scores = dct.get("score_tests", [])
        for i, (word, context, expected_score) in enumerate(scores):
            test_name = "test_score_{0}".format(i)
            score_test = cls.add_score_test(word, context, expected_score)
            score_test.__name__ = test_name
            dct[test_name] = score_test
        return super(ParametrizeTestsMeta, cls).__new__(cls, name, bases, dct)

    @classmethod
    def add_score_test(cls, word, context, expected_score):
        """Build a test asserting ``model.score(word, context)`` to 4 places.

        :param word: word to score.
        :param context: context passed to ``score`` (may be ``None``).
        :param expected_score: value the score must match.
        :return: a function usable as a test method.
        """
        if sys.version_info > (3, 5):
            message = "word='{word}', context={context}"
        else:
            # Python 2 doesn't report the mismatched values if we pass a custom
            # message, so we have to report them manually.
            message = (
                "{score} != {expected_score} within 4 places, "
                "word='{word}', context={context}"
            )

        def test_method(self):
            score = self.model.score(word, context)
            # Pass the fields explicitly instead of ``**locals()``: the latter
            # only works while every field happens to be referenced here.
            failure_message = message.format(
                score=score,
                expected_score=expected_score,
                word=word,
                context=context,
            )
            self.assertAlmostEqual(
                score, expected_score, msg=failure_message, places=4
            )

        return test_method

    @classmethod
    def add_sum_to_1_test(cls, context):
        """Build a test asserting that scores over the vocabulary sum to 1.

        :param context: context in which every vocabulary word is scored.
        :return: a function usable as a test method.
        """

        def test(self):
            s = sum(self.model.score(w, context) for w in self.model.vocab)
            self.assertAlmostEqual(s, 1.0, msg="The context is {}".format(context))

        return test
@add_metaclass(ParametrizeTestsMeta)
class MleBigramTests(unittest.TestCase):
    """Unit tests for the MLE model trained as a bigram model."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        ("d", ["c"], 1),
        # An ngram never seen in training scores 0.
        ("d", ["e"], 0),
        # The same holds for a unigram that was never seen.
        ("z", None, 0),
        # There are 14 unigrams in total, and count('a') = 2.
        ("a", None, 2.0 / 14),
        # count('y') = 3
        ("y", None, 3.0 / 14),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        model = MLE(2, vocabulary=vocab)
        model.fit(training_text)
        self.model = model

    def test_logscore_zero_score(self):
        # A zero score must translate into an infinite log score.
        unseen_logscore = self.model.logscore("d", ["e"])

        self.assertTrue(math.isinf(unseen_logscore))

    def test_entropy_perplexity_seen(self):
        # Every bigram below occurred during training.
        seen_bigrams = [
            ("<s>", "a"),
            ("a", "b"),
            ("b", "<UNK>"),
            ("<UNK>", "a"),
            ("a", "d"),
            ("d", "</s>"),
        ]
        # Log score per bigram:
        #   <s>, a   = -1
        #   a, b     = -1
        #   b, UNK   = -1
        #   UNK, a   = -1.585
        #   a, d     = -1
        #   d, </s>  = -1
        # Sum of log scores      = -6.585
        # Negated average of sum =  1.0975
        expected_entropy = 1.0975
        expected_perplexity = 2.1398

        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(seen_bigrams), places=4
        )
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(seen_bigrams), places=4
        )

    def test_entropy_perplexity_unseen(self):
        # With MLE a single unseen ngram is enough to make both measures
        # infinite.
        unseen_bigrams = [("<s>", "a"), ("a", "c"), ("c", "d"), ("d", "</s>")]

        self.assertTrue(math.isinf(self.model.entropy(unseen_bigrams)))
        self.assertTrue(math.isinf(self.model.perplexity(unseen_bigrams)))

    def test_entropy_perplexity_unigrams(self):
        # Score and log score per word ("-" is scored as UNK):
        #   <s>  = 0.1429, -2.8074
        #   a    = 0.1429, -2.8074
        #   c    = 0.0714, -3.8073
        #   UNK  = 0.2143, -2.2224
        #   d    = 0.1429, -2.8074
        #   c    = 0.0714, -3.8073
        #   </s> = 0.1429, -2.8074
        # Sum of log scores      = -21.6243
        # Negated average of sum =   3.0095
        expected_entropy = 3.0095
        expected_perplexity = 8.0529

        unigram_text = [("<s>",), ("a",), ("c",), ("-",), ("d",), ("c",), ("</s>",)]

        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(unigram_text), places=4
        )
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(unigram_text), places=4
        )
@add_metaclass(ParametrizeTestsMeta)
class MleTrigramTests(unittest.TestCase):
    """Unit tests for the MLE model trained as a trigram model."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # count(d | b, c) = 1 and count(b, c) = 1
        ("d", ("b", "c"), 1),
        # count(d | c) = 1 and count(c) = 1
        ("d", ["c"], 1),
        # "a" occurred 2 times among the 18 tokens in total
        ("a", None, 2.0 / 18),
        # part of the vocabulary, but never seen in training
        ("z", None, 0),
        # an out-of-vocabulary word is scored as "UNK"
        ("y", None, 3.0 / 18),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        model = MLE(3, vocabulary=vocab)
        model.fit(training_text)
        self.model = model
@add_metaclass(ParametrizeTestsMeta)
class LidstoneBigramTests(unittest.TestCase):
    """Unit tests for the Lidstone model (gamma 0.1) trained on bigrams."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    # A leading "*" in the notes below marks a smoothed (adjusted) count.
    score_tests = [
        # count(d | c) = 1                  -> *count(d | c) = 1.1
        # Count(w | c for w in vocab) = 1   -> *Count(w | c for w in vocab) = 1.8
        ("d", ["c"], 1.1 / 1.8),
        # Total unigrams: 14, vocab size: 8 -> denominator 14 + 0.8 = 14.8
        # count("a") = 2                    -> *count("a") = 2.1
        ("a", None, 2.1 / 14.8),
        # In the vocabulary but never seen:
        # count("z") = 0                    -> *count("z") = 0.1
        ("z", None, 0.1 / 14.8),
        # Out of vocabulary, so the "UNK" score applies:
        # count("<UNK>") = 3                -> *count("<UNK>") = 3.1
        ("y", None, 3.1 / 14.8),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        model = Lidstone(0.1, 2, vocabulary=vocab)
        model.fit(training_text)
        self.model = model

    def test_gamma(self):
        self.assertEqual(0.1, self.model.gamma)

    def test_entropy_perplexity(self):
        bigram_text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # In contrast to MLE, smoothing copes with ngrams never seen before.
        # Score and log score per bigram:
        #   <s>, a  = 0.3929, -1.3479
        #   a, c    = 0.0357, -4.8074
        #   c, UNK  = 0.0(5), -4.1699
        #   UNK, d  = 0.0263, -5.2479
        #   d, c    = 0.0357, -4.8074
        #   c, </s> = 0.0(5), -4.1699
        # Sum of negated log scores     = 24.5504
        # Average of negated log scores =  4.0917
        expected_entropy = 4.0917
        expected_perplexity = 17.0504

        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(bigram_text), places=4
        )
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(bigram_text), places=4
        )
@add_metaclass(ParametrizeTestsMeta)
class LidstoneTrigramTests(unittest.TestCase):
    """Unit tests for the Lidstone model (gamma 0.1) trained on trigrams."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # Same reasoning as for the bigram model.
        ("d", ["c"], 1.1 / 1.8),
        # A word that has not appeared after (b, c).
        ("e", ["c"], 0.1 / 1.8),
        # The same two words, now scored with a full trigram context.
        ("d", ["b", "c"], 1.1 / 1.8),
        ("e", ["b", "c"], 0.1 / 1.8),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        model = Lidstone(0.1, 3, vocabulary=vocab)
        model.fit(training_text)
        self.model = model
@add_metaclass(ParametrizeTestsMeta)
class LaplaceBigramTests(unittest.TestCase):
    """Unit tests for the Laplace model trained on bigrams."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    # A leading "*" in the notes below marks a smoothed (adjusted) count.
    score_tests = [
        # Basic sanity check:
        # count(d | c) = 1                  -> *count(d | c) = 2
        # Count(w | c for w in vocab) = 1   -> *Count(w | c for w in vocab) = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14, vocab size: 8 -> denominator 14 + 8 = 22
        # count("a") = 2                    -> *count("a") = 3
        ("a", None, 3.0 / 22),
        # In the vocabulary but never seen:
        # count("z") = 0                    -> *count("z") = 1
        ("z", None, 1.0 / 22),
        # Out of vocabulary, so the "UNK" score applies:
        # count("<UNK>") = 3                -> *count("<UNK>") = 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        model = Laplace(2, vocabulary=vocab)
        model.fit(training_text)
        self.model = model

    def test_gamma(self):
        # Laplace smoothing is expected to fix gamma at 1.
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        bigram_text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # In contrast to MLE, smoothing copes with ngrams never seen before.
        # Score and log score per bigram:
        #   <s>, a  = 0.2,    -2.3219
        #   a, c    = 0.1,    -3.3219
        #   c, UNK  = 0.(1),  -3.1699
        #   UNK, d  = 0.(09), -3.4594
        #   d, c    = 0.1,    -3.3219
        #   c, </s> = 0.(1),  -3.1699
        # Sum of negated log scores     = 18.7651
        # Average of negated log scores =  3.1275
        expected_entropy = 3.1275
        expected_perplexity = 8.7393

        self.assertAlmostEqual(
            expected_entropy, self.model.entropy(bigram_text), places=4
        )
        self.assertAlmostEqual(
            expected_perplexity, self.model.perplexity(bigram_text), places=4
        )
@add_metaclass(ParametrizeTestsMeta)
class WittenBellInterpolatedTrigramTests(unittest.TestCase):
    """Unit tests for the WittenBellInterpolated model trained on trigrams."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # Unigram scores fall back to MLE by default.
        # Total unigrams: 18, count('c'): 1
        ("c", None, 1.0 / 18),
        # In the vocabulary but never seen: count("z") = 0
        ("z", None, 0.0 / 18),
        # Out of vocabulary, so the "UNK" score applies: count("<UNK>") = 3
        ("y", None, 3.0 / 18),
        # gamma(['b']) = 0.1111
        # mle.score('c', ['b']) = 0.5
        # expected = (1 - gamma) * mle('c' | 'b') + gamma * mle('c')
        ("c", ["b"], (1 - 0.1111) * 0.5 + 0.1111 * 1 / 18),
        # The trigram 'a b c' builds on the bigram case above:
        # gamma(['a', 'b']) = 0.0667
        # mle("c", ["a", "b"]) = 1
        ("c", ["a", "b"], (1 - 0.0667) + 0.0667 * ((1 - 0.1111) * 0.5 + 0.1111 / 18)),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        model = WittenBellInterpolated(3, vocabulary=vocab)
        model.fit(training_text)
        self.model = model
@add_metaclass(ParametrizeTestsMeta)
class KneserNeyInterpolatedTrigramTests(unittest.TestCase):
    """Unit tests for the KneserNeyInterpolated model trained on trigrams."""

    # (word, context, expected score) triples turned into tests by the metaclass.
    score_tests = [
        # Unigram scores fall back to a uniform distribution.
        # Vocab size: 8, count('c'): 1
        ("c", None, 1.0 / 8),
        # In the vocabulary but never seen: still uniform.
        ("z", None, 1 / 8),
        # Out of vocabulary uses the "UNK" score, which is uniform as well.
        ("y", None, 1.0 / 8),
        # alpha = count('bc') - discount = 1 - 0.1 = 0.9
        # gamma(['b']) = discount * number of unique words following ['b']
        #              = 0.1 * 2
        # normalizer = total number of bigrams with this context = 2
        # expected = (alpha + gamma * unigram_score("c")) / normalizer
        ("c", ["b"], (0.9 + 0.2 * (1 / 8)) / 2),
        # The trigram 'a b c' builds on the bigram case above:
        # alpha = count('abc') - discount = 1 - 0.1 = 0.9
        # gamma(['a', 'b']) = 0.1 * 1
        # normalizer = total number of trigrams with prefix "ab" = 1, so it
        # can be left out.
        ("c", ["a", "b"], 0.9 + 0.1 * ((0.9 + 0.2 * (1 / 8)) / 2)),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        model = KneserNeyInterpolated(3, vocabulary=vocab)
        model.fit(training_text)
        self.model = model
class NgramModelTextGenerationTests(unittest.TestCase):
    """Text generation tests that use an MLE trigram model."""

    def setUp(self):
        vocab, training_text = _prepare_test_data(3)
        model = MLE(3, vocabulary=vocab)
        model.fit(training_text)
        self.model = model

    def test_generate_one_no_context(self):
        generated = self.model.generate(random_seed=3)

        self.assertEqual(generated, "<UNK>")

    def test_generate_one_limiting_context(self):
        # A context with a single continuation needs no random_seed.
        for text_seed in (["c"], ["b", "c"], ["a", "c"]):
            self.assertEqual(self.model.generate(text_seed=text_seed), "d")

    def test_generate_one_varied_context(self):
        # The context leaves several options open, so the choice is seeded.
        generated = self.model.generate(text_seed=("a", "<s>"), random_seed=2)

        self.assertEqual(generated, "a")

    def test_generate_cycle(self):
        # Introduce a cycle into the model: bd -> b, db -> d
        cyclic_sentence = list("bdbdbd")
        more_training_text = [
            list(padded_everygrams(self.model.order, cyclic_sentence))
        ]
        self.model.fit(more_training_text)

        # Generation must still be able to leave the cycle.
        generated = self.model.generate(7, text_seed=("b", "d"), random_seed=5)

        self.assertEqual(generated, ["b", "d", "b", "d", "b", "d", "</s>"])

    def test_generate_with_text_seed(self):
        generated = self.model.generate(5, text_seed=("<s>", "e"), random_seed=3)

        self.assertEqual(generated, ["<UNK>", "a", "d", "b", "<UNK>"])

    def test_generate_oov_text_seed(self):
        # An out-of-vocabulary seed must behave exactly like the "<UNK>" seed.
        from_oov_seed = self.model.generate(text_seed=("aliens",), random_seed=3)
        from_unk_seed = self.model.generate(text_seed=("<UNK>",), random_seed=3)

        self.assertEqual(from_oov_seed, from_unk_seed)

    def test_generate_None_text_seed(self):
        # A None token inside the seed is expected to raise a TypeError.
        with self.assertRaises(TypeError):
            self.model.generate(text_seed=(None,))

        # Passing None as the whole seed is fine and equals passing no seed.
        with_none_seed = self.model.generate(text_seed=None, random_seed=3)
        without_seed = self.model.generate(random_seed=3)

        self.assertEqual(with_none_seed, without_seed)

View File

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
from nltk.lm.preprocessing import padded_everygram_pipeline
class TestPreprocessing(unittest.TestCase):
    """Tests for the helpers in ``nltk.lm.preprocessing``."""

    def test_padded_everygram_pipeline(self):
        # For order 2 the single sentence yields its padded unigrams first,
        # followed by its padded bigrams.
        padded_unigrams = [("<s>",), ("a",), ("b",), ("c",), ("</s>",)]
        padded_bigrams = [("<s>", "a"), ("a", "b"), ("b", "c"), ("c", "</s>")]
        expected_train = [padded_unigrams + padded_bigrams]
        expected_vocab = ["<s>", "a", "b", "c", "</s>"]

        train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]])

        # Materialise each sentence before comparing it with a plain list.
        actual_train = [list(sent) for sent in train_data]
        self.assertEqual(actual_train, expected_train)
        self.assertEqual(list(vocab_data), expected_vocab)

View File

@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import unittest
from collections import Counter
import six
from nltk.lm import Vocabulary
class NgramModelVocabularyTests(unittest.TestCase):
    """Tests for the Vocabulary class."""

    @classmethod
    def setUpClass(cls):
        # "a", "b", "d" and "e" occur twice, every other token only once.
        tokens = ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
        cls.vocab = Vocabulary(tokens, unk_cutoff=2)

    def test_truthiness(self):
        self.assertTrue(self.vocab)

    def test_cutoff_value_set_correctly(self):
        self.assertEqual(self.vocab.cutoff, 2)

    def test_unable_to_change_cutoff(self):
        with self.assertRaises(AttributeError):
            self.vocab.cutoff = 3

    def test_cutoff_setter_checks_value(self):
        with self.assertRaises(ValueError) as exc_info:
            Vocabulary("abc", unk_cutoff=0)

        actual_error_msg = str(exc_info.exception)
        self.assertEqual(
            "Cutoff value cannot be less than 1. Got: 0", actual_error_msg
        )

    def test_counts_set_correctly(self):
        for token, expected_count in (("a", 2), ("b", 2), ("c", 1)):
            self.assertEqual(self.vocab.counts[token], expected_count)

    def test_membership_check_respects_cutoff(self):
        # "a" was seen 2 times, which is enough to be part of the vocabulary.
        self.assertTrue("a" in self.vocab)
        # "c" was seen only once, so it is not part of the vocabulary.
        self.assertFalse("c" in self.vocab)
        # "z" is expected to be outside the vocabulary as well.
        self.assertFalse("z" in self.vocab)

    def test_vocab_len_respects_cutoff(self):
        # The size is the number of unique tokens occurring at least as often
        # as the cutoff, plus 1 for the unknown-word label.
        self.assertEqual(5, len(self.vocab))

    def test_vocab_iter_respects_cutoff(self):
        every_counted_token = ["a", "b", "c", "d", "e", "f", "g", "w", "z"]
        tokens_above_cutoff = ["a", "b", "d", "e", "<UNK>"]

        six.assertCountEqual(
            self, every_counted_token, list(self.vocab.counts.keys())
        )
        six.assertCountEqual(self, tokens_above_cutoff, list(self.vocab))

    def test_update_empty_vocab(self):
        empty = Vocabulary(unk_cutoff=2)
        self.assertEqual(len(empty), 0)
        self.assertFalse(empty)
        self.assertIn(empty.unk_label, empty)

        # The unknown label has to remain a member after an update.
        empty.update(list("abcde"))
        self.assertIn(empty.unk_label, empty)

    def test_lookup(self):
        self.assertEqual(self.vocab.lookup("a"), "a")
        self.assertEqual(self.vocab.lookup("c"), "<UNK>")

    def test_lookup_iterables(self):
        self.assertEqual(self.vocab.lookup(["a", "b"]), ("a", "b"))
        self.assertEqual(self.vocab.lookup(("a", "b")), ("a", "b"))
        self.assertEqual(self.vocab.lookup(("a", "c")), ("a", "<UNK>"))

        digit_strings = map(str, range(3))
        self.assertEqual(
            self.vocab.lookup(digit_strings), ("<UNK>", "<UNK>", "<UNK>")
        )

    def test_lookup_empty_iterables(self):
        self.assertEqual(self.vocab.lookup(()), ())
        self.assertEqual(self.vocab.lookup([]), ())
        self.assertEqual(self.vocab.lookup(iter([])), ())
        self.assertEqual(self.vocab.lookup(n for n in range(0, 0)), ())

    def test_lookup_recursive(self):
        # Nested iterables come back as equally nested tuples.
        self.assertEqual(
            self.vocab.lookup([["a", "b"], ["a", "c"]]),
            (("a", "b"), ("a", "<UNK>")),
        )
        self.assertEqual(
            self.vocab.lookup([["a", "b"], "c"]), (("a", "b"), "<UNK>")
        )
        self.assertEqual(
            self.vocab.lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),)
        )

    def test_lookup_None(self):
        with self.assertRaises(TypeError):
            self.vocab.lookup(None)

        with self.assertRaises(TypeError):
            list(self.vocab.lookup([None, None]))

    def test_lookup_int(self):
        with self.assertRaises(TypeError):
            self.vocab.lookup(1)

        with self.assertRaises(TypeError):
            list(self.vocab.lookup([1, 2]))

    def test_lookup_empty_str(self):
        self.assertEqual(self.vocab.lookup(""), "<UNK>")

    def test_eqality(self):
        v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1)
        v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah")
        v4 = Vocabulary(["a", "b"], unk_cutoff=1)

        # Same tokens and settings compare equal; a different unknown label
        # or a different token set does not.
        self.assertEqual(v1, v2)
        self.assertNotEqual(v1, v3)
        self.assertNotEqual(v1, v4)

    def test_str(self):
        expected_str = "<Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>"

        self.assertEqual(str(self.vocab), expected_str)

    def test_creation_with_counter(self):
        # A Counter over the same tokens must give an equal Vocabulary.
        token_counts = Counter(
            ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"]
        )
        from_counter = Vocabulary(token_counts, unk_cutoff=2)

        self.assertEqual(self.vocab, from_counter)