Initial commit

2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions
--- a/venv/lib/python3.7/site-packages/nltk/test/unit/test_stem.py
+++ b/venv/lib/python3.7/site-packages/nltk/test/unit/test_stem.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function, unicode_literals
+
+import os
+import unittest
+from contextlib import closing
+
+from nltk import data
+from nltk.stem.snowball import SnowballStemmer
+from nltk.stem.porter import PorterStemmer
+
+
+class SnowballTest(unittest.TestCase):
+    def test_arabic(self):
+        """
+        this unit testing for test the snowball arabic light stemmer
+        this stemmer deals with prefixes and suffixes
+        """
+        # Test where the ignore_stopwords=True.
+        ar_stemmer = SnowballStemmer("arabic", True)
+        assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
+        assert ar_stemmer.stem("العربية") == "عرب"
+        assert ar_stemmer.stem("فقالوا") == "قال"
+        assert ar_stemmer.stem("الطالبات") == "طالب"
+        assert ar_stemmer.stem("فالطالبات") == "طالب"
+        assert ar_stemmer.stem("والطالبات") == "طالب"
+        assert ar_stemmer.stem("الطالبون") == "طالب"
+        assert ar_stemmer.stem("اللذان") == "اللذان"
+        assert ar_stemmer.stem("من") == "من"
+        # Test where the ignore_stopwords=False.
+        ar_stemmer = SnowballStemmer("arabic", False)
+        assert ar_stemmer.stem("اللذان") == "اللذ"  # this is a stop word
+        assert ar_stemmer.stem("الطالبات") == "طالب"
+        assert ar_stemmer.stem("الكلمات") == "كلم"
+        # test where create the arabic stemmer without given init value to ignore_stopwords
+        ar_stemmer = SnowballStemmer("arabic")
+        assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
+        assert ar_stemmer.stem("العربية") == "عرب"
+        assert ar_stemmer.stem("فقالوا") == "قال"
+        assert ar_stemmer.stem("الطالبات") == "طالب"
+        assert ar_stemmer.stem("الكلمات") == "كلم"
+
+    def test_russian(self):
+        stemmer_russian = SnowballStemmer("russian")
+        assert stemmer_russian.stem("авантненькая") == "авантненьк"
+
+    def test_german(self):
+        stemmer_german = SnowballStemmer("german")
+        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
+
+        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
+        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'
+
+        assert stemmer_german.stem("keinen") == 'kein'
+        assert stemmer_german2.stem("keinen") == 'keinen'
+
+    def test_spanish(self):
+        stemmer = SnowballStemmer('spanish')
+
+        assert stemmer.stem("Visionado") == 'vision'
+
+        # The word 'algue' was raising an IndexError
+        assert stemmer.stem("algue") == 'algu'
+
+    def test_short_strings_bug(self):
+        stemmer = SnowballStemmer('english')
+        assert stemmer.stem("y's") == 'y'
+
+
+class PorterTest(unittest.TestCase):
+    def _vocabulary(self):
+        with closing(
+            data.find('stemmers/porter_test/porter_vocabulary.txt').open(
+                encoding='utf-8'
+            )
+        ) as fp:
+            return fp.read().splitlines()
+
+    def _test_against_expected_output(self, stemmer_mode, expected_stems):
+        stemmer = PorterStemmer(mode=stemmer_mode)
+        for word, true_stem in zip(self._vocabulary(), expected_stems):
+            our_stem = stemmer.stem(word)
+            assert our_stem == true_stem, (
+                "%s should stem to %s in %s mode but got %s"
+                % (word, true_stem, stemmer_mode, our_stem)
+            )
+
+    def test_vocabulary_martin_mode(self):
+        """Tests all words from the test vocabulary provided by M Porter
+
+        The sample vocabulary and output were sourced from:
+            http://tartarus.org/martin/PorterStemmer/voc.txt
+            http://tartarus.org/martin/PorterStemmer/output.txt
+        and are linked to from the Porter Stemmer algorithm's homepage
+        at
+            http://tartarus.org/martin/PorterStemmer/
+        """
+        with closing(
+            data.find('stemmers/porter_test/porter_martin_output.txt').open(
+                encoding='utf-8'
+            )
+        ) as fp:
+            self._test_against_expected_output(
+                PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines()
+            )
+
+    def test_vocabulary_nltk_mode(self):
+        with closing(
+            data.find('stemmers/porter_test/porter_nltk_output.txt').open(
+                encoding='utf-8'
+            )
+        ) as fp:
+            self._test_against_expected_output(
+                PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines()
+            )
+
+    def test_vocabulary_original_mode(self):
+        # The list of stems for this test was generated by taking the
+        # Martin-blessed stemmer from
+        # http://tartarus.org/martin/PorterStemmer/c.txt
+        # and removing all the --DEPARTURE-- sections from it and
+        # running it against Martin's test vocabulary.
+
+        with closing(
+            data.find('stemmers/porter_test/porter_original_output.txt').open(
+                encoding='utf-8'
+            )
+        ) as fp:
+            self._test_against_expected_output(
+                PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines()
+            )
+
+        self._test_against_expected_output(
+            PorterStemmer.ORIGINAL_ALGORITHM,
+            data.find('stemmers/porter_test/porter_original_output.txt')
+            .open(encoding='utf-8')
+            .read()
+            .splitlines(),
+        )
+
+    def test_oed_bug(self):
+        """Test for bug https://github.com/nltk/nltk/issues/1581
+
+        Ensures that 'oed' can be stemmed without throwing an error.
+        """
+        assert PorterStemmer().stem('oed') == 'o'