Initial commit

2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions
--- a/venv/lib/python3.7/site-packages/nltk/classify/rte_classify.py
+++ b/venv/lib/python3.7/site-packages/nltk/classify/rte_classify.py
@@ -0,0 +1,180 @@
+# Natural Language Toolkit: RTE Classifier
+#
+# Copyright (C) 2001-2019 NLTK Project
+# Author: Ewan Klein <ewan@inf.ed.ac.uk>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Simple classifier for RTE corpus.
+
+It calculates the overlap in words and named entities between text and
+hypothesis, and also whether there are words / named entities in the
+hypothesis which fail to occur in the text, since this is an indicator that
+the hypothesis is more informative than (i.e. not entailed by) the text.
+
+TO DO: better Named Entity classification
+TO DO: add lemmatization
+"""
+from __future__ import print_function
+
+from nltk.tokenize import RegexpTokenizer
+from nltk.classify.util import accuracy, check_megam_config
+from nltk.classify.maxent import MaxentClassifier
+
+
+class RTEFeatureExtractor(object):
+    """
+    This builds a bag of words for both the text and the hypothesis after
+    throwing away some stopwords, then calculates overlap and difference.
+
+    :ivar stop: whether stopwords are removed from the word sets
+    :ivar stopwords: the words discarded when ``stop`` is true
+    :ivar negwords: negation-like words; not used by the extractor itself,
+        but exposed so that callers can intersect them with the word sets
+    :ivar text_tokens: tokens of the text, in order of occurrence
+    :ivar hyp_tokens: tokens of the hypothesis, in order of occurrence
+    :ivar text_words: set of word types kept for the text
+    :ivar hyp_words: set of word types kept for the hypothesis
+    """
+
+    def __init__(self, rtepair, stop=True, use_lemmatize=False):
+        """
+        :param rtepair: a ``RTEPair`` from which features should be extracted;
+            only its ``text`` and ``hyp`` attributes are read
+        :param stop: if ``True``, stopwords are thrown away.
+        :type stop: bool
+        :param use_lemmatize: if ``True``, each token is replaced by its
+            WordNet verb base form (when one exists) before the word sets
+            are built.
+        :type use_lemmatize: bool
+        """
+        self.stop = stop
+        self.stopwords = {
+            'a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
+            'have', 'are', 'were', 'and', 'very', '.', ',',
+        }
+        self.negwords = {'no', 'not', 'never', 'failed', 'rejected', 'denied'}
+
+        # Try to tokenize so that abbreviations, monetary amounts, email
+        # addresses, URLs are single tokens.  The pattern is a raw string so
+        # that ``\w``, ``\$`` and ``\d`` reach the regex engine without going
+        # through (invalid) string-literal escape processing.
+        tokenizer = RegexpTokenizer(r'[\w.@:/]+|\w+|\$[\d.]+')
+
+        # Get the set of word types for text and hypothesis
+        self.text_tokens = tokenizer.tokenize(rtepair.text)
+        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
+
+        if use_lemmatize:
+            self.text_words = {self._lemmatize(tok) for tok in self.text_tokens}
+            self.hyp_words = {self._lemmatize(tok) for tok in self.hyp_tokens}
+        else:
+            self.text_words = set(self.text_tokens)
+            self.hyp_words = set(self.hyp_tokens)
+
+        if self.stop:
+            self.text_words = self.text_words - self.stopwords
+            self.hyp_words = self.hyp_words - self.stopwords
+
+        # Shared words, and the words that occur on one side only.
+        self._overlap = self.hyp_words & self.text_words
+        self._hyp_extra = self.hyp_words - self.text_words
+        self._txt_extra = self.text_words - self.hyp_words
+
+    def overlap(self, toktype, debug=False):
+        """
+        Compute the overlap between text and hypothesis.
+
+        :param toktype: distinguish Named Entities from ordinary words
+        :type toktype: 'ne' or 'word'
+        :param debug: if ``True``, print the selected set before returning it
+        :type debug: bool
+        :return: the words of the requested type found in both the text and
+            the hypothesis
+        :rtype: set(str)
+        :raise ValueError: if ``toktype`` is neither ``'ne'`` nor ``'word'``
+        """
+        ne_overlap = {token for token in self._overlap if self._ne(token)}
+        if toktype == 'ne':
+            label, selected = "ne overlap", ne_overlap
+        elif toktype == 'word':
+            label, selected = "word overlap", self._overlap - ne_overlap
+        else:
+            raise ValueError("Type not recognized:'%s'" % toktype)
+        if debug:
+            print(label, selected)
+        return selected
+
+    def hyp_extra(self, toktype, debug=True):
+        """
+        Compute the extraneous material in the hypothesis.
+
+        :param toktype: distinguish Named Entities from ordinary words
+        :type toktype: 'ne' or 'word'
+        :param debug: accepted for interface compatibility; currently ignored
+        :type debug: bool
+        :return: the words of the requested type found in the hypothesis but
+            not in the text
+        :rtype: set(str)
+        :raise ValueError: if ``toktype`` is neither ``'ne'`` nor ``'word'``
+        """
+        ne_extra = {token for token in self._hyp_extra if self._ne(token)}
+        if toktype == 'ne':
+            return ne_extra
+        if toktype == 'word':
+            return self._hyp_extra - ne_extra
+        raise ValueError("Type not recognized: '%s'" % toktype)
+
+    @staticmethod
+    def _ne(token):
+        """
+        This just assumes that words in all caps or titles are
+        named entities.
+
+        :type token: str
+        :rtype: bool
+        """
+        return token.istitle() or token.isupper()
+
+    @staticmethod
+    def _lemmatize(word):
+        """
+        Use morphy from WordNet to find the base form of verbs.
+
+        :type word: str
+        :return: the verb lemma of ``word``, or ``word`` itself when morphy
+            finds none
+        """
+        # Local import: this module does not import ``nltk`` at top level,
+        # and WordNet is only needed when lemmatization is requested.
+        from nltk.corpus import wordnet
+
+        lemma = wordnet.morphy(word, pos=wordnet.VERB)
+        return word if lemma is None else lemma
+
+
+def rte_features(rtepair):
+    """
+    Build the feature dictionary for a single RTE pair.
+
+    The counts come from an ``RTEFeatureExtractor`` created with its default
+    settings: sizes of the word / named-entity overlap, sizes of the
+    word / named-entity material found only in the hypothesis, and the number
+    of negation words present in the text and in the hypothesis.
+
+    :param rtepair: the pair handed to ``RTEFeatureExtractor``
+    :return: a mapping from feature name to value
+    :rtype: dict
+    """
+    extractor = RTEFeatureExtractor(rtepair)
+    negation = extractor.negwords
+    return {
+        # Constant feature that is switched on for every pair.
+        'alwayson': True,
+        'word_overlap': len(extractor.overlap('word')),
+        'word_hyp_extra': len(extractor.hyp_extra('word')),
+        'ne_overlap': len(extractor.overlap('ne')),
+        'ne_hyp_extra': len(extractor.hyp_extra('ne')),
+        'neg_txt': len(negation & extractor.text_words),
+        'neg_hyp': len(negation & extractor.hyp_words),
+    }
+
+
+def rte_featurize(rte_pairs):
+    """
+    Turn RTE pairs into ``(features, label)`` tuples.
+
+    :param rte_pairs: an iterable of pairs; each one is passed to
+        ``rte_features`` and its ``value`` attribute is used as the label
+    :return: one ``(feature dict, value)`` tuple per pair, in input order
+    :rtype: list(tuple)
+    """
+    featurized = []
+    for pair in rte_pairs:
+        features = rte_features(pair)
+        featurized.append((features, pair.value))
+    return featurized
+
+
+def rte_classifier(algorithm):
+    """
+    Train a maximum entropy classifier on the RTE corpus and evaluate it.
+
+    The ``rte1``-``rte3`` ``*_dev.xml`` files are used for training and the
+    matching ``*_test.xml`` files for testing.  Progress messages and the
+    accuracy on the test files are printed.
+
+    :param algorithm: name of the training algorithm handed to
+        ``MaxentClassifier.train``; one of ``'megam'``, ``'BFGS'``, ``'GIS'``
+        or ``'IIS'``
+    :type algorithm: str
+    :return: the trained classifier
+    :raise ValueError: if ``algorithm`` is not one of the supported names
+    """
+    megam_algorithms = ('megam', 'BFGS')  # MEGAM based algorithms.
+    builtin_algorithms = ('GIS', 'IIS')  # Default GIS/IIS MaxEnt algorithms.
+
+    # Reject unsupported names up front, before the corpus is loaded and
+    # featurized.
+    if algorithm not in megam_algorithms + builtin_algorithms:
+        raise ValueError(
+            "RTEClassifier only supports these algorithms:\n "
+            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
+        )
+
+    from nltk.corpus import rte as rte_corpus
+
+    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
+    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
+    featurized_train_set = rte_featurize(train_set)
+    featurized_test_set = rte_featurize(test_set)
+
+    # Train the classifier
+    print('Training classifier...')
+    if algorithm in megam_algorithms:
+        # Ensure that MEGAM is configured first.
+        check_megam_config()
+    # Every branch must yield a trained classifier object (not a callable
+    # wrapping the training call): it is scored below and returned.
+    clf = MaxentClassifier.train(featurized_train_set, algorithm)
+
+    print('Testing classifier...')
+    acc = accuracy(clf, featurized_test_set)
+    print('Accuracy: %6.4f' % acc)
+    return clf