Initial commit

2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions
--- a/venv/lib/python3.7/site-packages/nltk/tokenize/stanford_segmenter.py
+++ b/venv/lib/python3.7/site-packages/nltk/tokenize/stanford_segmenter.py
@@ -0,0 +1,314 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Natural Language Toolkit: Interface to the Stanford Segmenter
+# for Chinese and Arabic
+#
+# Copyright (C) 2001-2019 NLTK Project
+# Author: 52nlp <52nlpcn@gmail.com>
+#         Casper Lehmann-Strøm <casperlehmann@gmail.com>
+#         Alex Constantin <alex@keyworder.ch>
+#
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+
+from __future__ import unicode_literals, print_function
+
+import tempfile
+import os
+import json
+import warnings
+from subprocess import PIPE
+
+from six import text_type
+
+from nltk import compat
+from nltk.internals import (
+    find_jar,
+    find_file,
+    find_dir,
+    config_java,
+    java,
+    _java_options,
+)
+from nltk.tokenize.api import TokenizerI
+
+
+# Download location of the Stanford NLP software.  It is passed as the ``url``
+# argument to every ``find_jar`` / ``find_file`` / ``find_dir`` lookup in this
+# module (presumably so lookup failures can point the user at it -- confirm
+# against nltk.internals).
+_stanford_url = 'https://nlp.stanford.edu/software'
+
+
+class StanfordSegmenter(TokenizerI):
+    """Interface to the Stanford Segmenter
+
+    The segmenter is run as an external Java process; each call to
+    ``segment``, ``segment_sents`` or ``segment_file`` launches Java with the
+    class, model and dictionaries configured on this instance (either through
+    the constructor arguments or through ``default_config``).
+
+    If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
+    should be provided, for example::
+
+        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
+
+    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
+    >>> seg = StanfordSegmenter()
+    >>> seg.default_config('zh')
+    >>> sent = u'这是斯坦福中文分词器测试'
+    >>> print(seg.segment(sent))
+    \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
+    <BLANKLINE>
+    >>> seg.default_config('ar')
+    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
+    >>> print(seg.segment(sent.split()))
+    \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
+    <BLANKLINE>
+    """
+
+    # File name of the segmenter jar handed to ``find_jar``.
+    _JAR = 'stanford-segmenter.jar'
+
+    def __init__(
+        self,
+        path_to_jar=None,
+        path_to_slf4j=None,
+        java_class=None,
+        path_to_model=None,
+        path_to_dict=None,
+        path_to_sihan_corpora_dict=None,
+        sihan_post_processing='false',
+        keep_whitespaces='false',
+        encoding='UTF-8',
+        options=None,
+        verbose=False,
+        java_options='-mx2g',
+    ):
+        """
+        Locate the segmenter jar(s) and record the run-time configuration.
+
+        :param path_to_jar: explicit path forwarded to ``find_jar`` for the
+            segmenter jar (the ``STANFORD_SEGMENTER`` environment variable
+            name is forwarded as well).
+        :param path_to_slf4j: explicit path forwarded to ``find_jar`` for
+            ``slf4j-api.jar``; when None no slf4j jar is put on the classpath.
+        :param java_class: name of the Java main class to run.
+        :param path_to_model: value passed to Java as ``-loadClassifier``.
+        :param path_to_dict: value passed to Java as ``-serDictionary``.
+        :param path_to_sihan_corpora_dict: value passed to Java as
+            ``-sighanCorporaDict``; when None the three Chinese-specific
+            options are omitted from the command line altogether.
+        :param sihan_post_processing: value passed as ``-sighanPostProcessing``
+            (a string such as ``'true'`` / ``'false'``).
+        :param keep_whitespaces: value passed as ``-keepAllWhitespaces``
+            (a string such as ``'true'`` / ``'false'``).
+        :param encoding: encoding used to write the temporary input file, to
+            decode the Java output, and passed as ``-inputEncoding``.
+        :param options: optional dict; rendered as comma-separated
+            ``key=<json value>`` pairs and passed as ``-options``.
+        :param verbose: forwarded to ``find_jar``.
+        :param java_options: option string given to ``config_java`` before
+            each Java invocation.
+        """
+        # Always show the deprecation notice, but do it inside
+        # ``catch_warnings`` so the caller's warning filters are restored
+        # afterwards.  (Installing a permanent 'ignore' filter here would
+        # silence every later DeprecationWarning in the whole process.)
+        with warnings.catch_warnings():
+            warnings.simplefilter('always', DeprecationWarning)
+            warnings.warn(
+                str(
+                    "\nThe StanfordSegmenter will "
+                    "be deprecated in version 3.2.5.\n"
+                    "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead."
+                ),
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+        stanford_segmenter = find_jar(
+            self._JAR,
+            path_to_jar,
+            env_vars=('STANFORD_SEGMENTER',),
+            searchpath=(),
+            url=_stanford_url,
+            verbose=verbose,
+        )
+
+        slf4j = None
+        if path_to_slf4j is not None:
+            slf4j = find_jar(
+                'slf4j-api.jar',
+                path_to_slf4j,
+                env_vars=('SLF4J', 'STANFORD_SEGMENTER'),
+                searchpath=(),
+                url=_stanford_url,
+                verbose=verbose,
+            )
+
+        # This is passed to java as the -cp option, the old version of segmenter needs slf4j.
+        # The new version of stanford-segmenter-2016-10-31 doesn't need slf4j
+        self._stanford_jar = os.pathsep.join(
+            jar for jar in (stanford_segmenter, slf4j) if jar is not None
+        )
+
+        self._java_class = java_class
+        self._model = path_to_model
+        self._sihan_corpora_dict = path_to_sihan_corpora_dict
+        self._sihan_post_processing = sihan_post_processing
+        self._keep_whitespaces = keep_whitespaces
+        self._dict = path_to_dict
+
+        # Path of the most recent temporary input file (set by segment_sents).
+        self._input_file_path = None
+
+        self._encoding = encoding
+        self.java_options = java_options
+        options = {} if options is None else options
+        self._options_cmd = ','.join(
+            '{0}={1}'.format(key, json.dumps(val)) for key, val in options.items()
+        )
+
+    def default_config(self, lang):
+        """
+        Attempt to initialize Stanford Word Segmenter for the specified language
+        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
+
+        :param lang: ``'ar'`` (Arabic) or ``'zh'`` (Chinese).
+        :raise LookupError: if ``lang`` is not supported, or if the model,
+            dictionary or data directory for the language cannot be found.
+        """
+
+        # When STANFORD_SEGMENTER is set, its ``data`` sub-directory is
+        # offered to find_file as an additional search location.
+        search_path = ()
+        segmenter_home = os.environ.get('STANFORD_SEGMENTER')
+        if segmenter_home:
+            search_path = (os.path.join(segmenter_home, 'data'),)
+
+        # init for Chinese-specific files
+        self._dict = None
+        self._sihan_corpora_dict = None
+        self._sihan_post_processing = 'false'
+
+        if lang == 'ar':
+            self._java_class = (
+                'edu.stanford.nlp.international.arabic.process.ArabicSegmenter'
+            )
+            model = 'arabic-segmenter-atb+bn+arztrain.ser.gz'
+
+        elif lang == 'zh':
+            self._java_class = 'edu.stanford.nlp.ie.crf.CRFClassifier'
+            model = 'pku.gz'
+            self._sihan_post_processing = 'true'
+
+            path_to_dict = 'dict-chris6.ser.gz'
+            try:
+                self._dict = find_file(
+                    path_to_dict,
+                    searchpath=search_path,
+                    url=_stanford_url,
+                    verbose=False,
+                    env_vars=('STANFORD_MODELS',),
+                )
+            except LookupError:
+                raise LookupError(
+                    "Could not find '%s' (tried using env. "
+                    "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
+                    % path_to_dict
+                )
+
+            sihan_dir = './data/'
+            try:
+                path_to_sihan_dir = find_dir(
+                    sihan_dir,
+                    url=_stanford_url,
+                    verbose=False,
+                    env_vars=('STANFORD_SEGMENTER',),
+                )
+                # NOTE(review): the located directory is joined with
+                # ``sihan_dir`` again -- presumably find_dir returns the
+                # segmenter root rather than the data directory; confirm
+                # against nltk.internals.find_dir.
+                self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
+            except LookupError:
+                raise LookupError(
+                    "Could not find '%s' (tried using the "
+                    "STANFORD_SEGMENTER environment variable)" % sihan_dir
+                )
+        else:
+            raise LookupError("Unsupported language {}".format(lang))
+
+        try:
+            self._model = find_file(
+                model,
+                searchpath=search_path,
+                url=_stanford_url,
+                verbose=False,
+                env_vars=('STANFORD_MODELS', 'STANFORD_SEGMENTER'),
+            )
+        except LookupError:
+            raise LookupError(
+                "Could not find '%s' (tried using env. "
+                "variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
+            )
+
+    def tokenize(self, s):
+        """
+        Delegate to ``TokenizerI.tokenize`` and return whatever it returns.
+
+        The result of the parent call is propagated to the caller rather than
+        discarded.  The explicit two-argument ``super`` form is used so the
+        call also works on Python 2, which the rest of this module targets.
+        """
+        return super(StanfordSegmenter, self).tokenize(s)
+
+    def _build_cmd(self, input_file_path):
+        """Return the Java argument list for segmenting ``input_file_path``."""
+        cmd = [
+            self._java_class,
+            '-loadClassifier',
+            self._model,
+            '-keepAllWhitespaces',
+            self._keep_whitespaces,
+            '-textFile',
+            input_file_path,
+        ]
+        # The dictionary / sighan options are only added when a sighan
+        # corpora dictionary has been configured.
+        if self._sihan_corpora_dict is not None:
+            cmd.extend(
+                [
+                    '-serDictionary',
+                    self._dict,
+                    '-sighanCorporaDict',
+                    self._sihan_corpora_dict,
+                    '-sighanPostProcessing',
+                    self._sihan_post_processing,
+                ]
+            )
+        return cmd
+
+    def segment_file(self, input_file_path):
+        """
+        Run the segmenter on an existing text file.
+
+        :param input_file_path: path passed to Java as ``-textFile``.
+        :return: the decoded standard output of the Java process.
+        """
+        return self._execute(self._build_cmd(input_file_path))
+
+    def segment(self, tokens):
+        """
+        Segment a single sentence; shorthand for ``segment_sents([tokens])``.
+
+        :param tokens: one sentence, as an iterable of strings (a plain
+            string is iterated character by character).
+        :return: the decoded standard output of the Java process.
+        """
+        return self.segment_sents([tokens])
+
+    def segment_sents(self, sentences):
+        """
+        Segment several sentences in one Java invocation.
+
+        The items of each sentence are joined with single spaces, the
+        sentences are joined with newlines, and the result is written to a
+        temporary file that is handed to the segmenter.  The temporary file
+        is removed afterwards, even when writing or the Java run fails.
+
+        :param sentences: iterable of sentences, each an iterable of strings.
+        :return: the decoded standard output of the Java process.
+        """
+        encoding = self._encoding
+
+        # Build the payload before creating the file so that a malformed
+        # ``sentences`` argument cannot leave a stray temporary file behind.
+        payload = '\n'.join(' '.join(sentence) for sentence in sentences)
+        if isinstance(payload, text_type) and encoding:
+            payload = payload.encode(encoding)
+
+        # Create a temporary input file
+        input_fd, input_path = tempfile.mkstemp(text=True)
+        self._input_file_path = input_path
+        try:
+            # Write the actual sentences to the temporary input file
+            with os.fdopen(input_fd, 'wb') as input_fh:
+                input_fh.write(payload)
+
+            return self._execute(self._build_cmd(input_path))
+        finally:
+            # Delete the temporary file
+            os.unlink(input_path)
+
+    def _execute(self, cmd, verbose=False):
+        """
+        Run Java with ``cmd`` and return its decoded standard output.
+
+        :param cmd: argument list (main class followed by its options); it is
+            copied, so the caller's list is left unmodified.
+        :param verbose: forwarded to ``config_java``.
+        :return: standard output of the Java process decoded with the
+            instance encoding.
+        """
+        encoding = self._encoding
+        full_cmd = list(cmd) + ['-inputEncoding', encoding]
+        if self._options_cmd:
+            full_cmd.extend(['-options', self._options_cmd])
+
+        # NOTE(review): ``_java_options`` is the object imported from
+        # nltk.internals at module import time -- presumably the default
+        # options; confirm it still reflects them if config_java rebinds it.
+        default_options = ' '.join(_java_options)
+
+        # Configure java.
+        config_java(options=self.java_options, verbose=verbose)
+        try:
+            stdout, _stderr = java(
+                full_cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
+            )
+        finally:
+            # Return java configurations to their default values, even when
+            # the Java run raised.
+            config_java(options=default_options, verbose=False)
+
+        return stdout.decode(encoding)
+
+
+def setup_module(module):
+    """
+    Test-runner hook: skip this module's tests when the segmenter is missing.
+
+    Builds a ``StanfordSegmenter`` and configures it for Arabic and then
+    Chinese; any ``LookupError`` raised along the way is converted into a
+    ``nose.SkipTest`` carrying the original message.
+    """
+    from nose import SkipTest
+
+    try:
+        segmenter = StanfordSegmenter()
+        for language in ('ar', 'zh'):
+            segmenter.default_config(language)
+    except LookupError as err:
+        reason = 'Tests for nltk.tokenize.stanford_segmenter skipped: %s' % str(err)
+        raise SkipTest(reason)