Initial commit
This commit is contained in:
493
venv/lib/python3.7/site-packages/nltk/corpus/__init__.py
Normal file
493
venv/lib/python3.7/site-packages/nltk/corpus/__init__.py
Normal file
@@ -0,0 +1,493 @@
|
||||
# Natural Language Toolkit: Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# TODO this docstring isn't up-to-date!
|
||||
"""
|
||||
NLTK corpus readers. The modules in this package provide functions
|
||||
that can be used to read corpus files in a variety of formats. These
|
||||
functions can be used to read both the corpus files that are
|
||||
distributed in the NLTK corpus package, and corpus files that are part
|
||||
of external corpora.
|
||||
|
||||
Available Corpora
|
||||
=================
|
||||
|
||||
Please see http://www.nltk.org/nltk_data/ for a complete list.
|
||||
Install corpora using nltk.download().
|
||||
|
||||
Corpus Reader Functions
|
||||
=======================
|
||||
Each corpus module defines one or more "corpus reader functions",
|
||||
which can be used to read documents from that corpus. These functions
|
||||
take an argument, ``item``, which is used to indicate which document
|
||||
should be read from the corpus:
|
||||
|
||||
- If ``item`` is one of the unique identifiers listed in the corpus
|
||||
module's ``items`` variable, then the corresponding document will
|
||||
be loaded from the NLTK corpus package.
|
||||
- If ``item`` is a filename, then that file will be read.
|
||||
|
||||
Additionally, corpus reader functions can be given lists of item
|
||||
names; in which case, they will return a concatenation of the
|
||||
corresponding documents.
|
||||
|
||||
Corpus reader functions are named based on the type of information
|
||||
they return. Some common examples, and their return types, are:
|
||||
|
||||
- words(): list of str
|
||||
- sents(): list of (list of str)
|
||||
- paras(): list of (list of (list of str))
|
||||
- tagged_words(): list of (str,str) tuple
|
||||
- tagged_sents(): list of (list of (str,str))
|
||||
- tagged_paras(): list of (list of (list of (str,str)))
|
||||
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
||||
- parsed_sents(): list of (Tree with str leaves)
|
||||
- parsed_paras(): list of (list of (Tree with str leaves))
|
||||
- xml(): A single xml ElementTree
|
||||
- raw(): unprocessed corpus contents
|
||||
|
||||
For example, to read a list of the words in the Brown Corpus, use
|
||||
``nltk.corpus.brown.words()``:
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> print(", ".join(brown.words()))
|
||||
The, Fulton, County, Grand, Jury, said, ...
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
from nltk.corpus.util import LazyCorpusLoader
|
||||
from nltk.corpus.reader import *
|
||||
|
||||
abc = LazyCorpusLoader(
|
||||
'abc',
|
||||
PlaintextCorpusReader,
|
||||
r'(?!\.).*\.txt',
|
||||
encoding=[('science', 'latin_1'), ('rural', 'utf8')],
|
||||
)
|
||||
alpino = LazyCorpusLoader('alpino', AlpinoCorpusReader, tagset='alpino')
|
||||
brown = LazyCorpusLoader(
|
||||
'brown',
|
||||
CategorizedTaggedCorpusReader,
|
||||
r'c[a-z]\d\d',
|
||||
cat_file='cats.txt',
|
||||
tagset='brown',
|
||||
encoding="ascii",
|
||||
)
|
||||
cess_cat = LazyCorpusLoader(
|
||||
'cess_cat',
|
||||
BracketParseCorpusReader,
|
||||
r'(?!\.).*\.tbf',
|
||||
tagset='unknown',
|
||||
encoding='ISO-8859-15',
|
||||
)
|
||||
cess_esp = LazyCorpusLoader(
|
||||
'cess_esp',
|
||||
BracketParseCorpusReader,
|
||||
r'(?!\.).*\.tbf',
|
||||
tagset='unknown',
|
||||
encoding='ISO-8859-15',
|
||||
)
|
||||
cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
|
||||
comtrans = LazyCorpusLoader('comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
|
||||
comparative_sentences = LazyCorpusLoader(
|
||||
'comparative_sentences',
|
||||
ComparativeSentencesCorpusReader,
|
||||
r'labeledSentences\.txt',
|
||||
encoding='latin-1',
|
||||
)
|
||||
conll2000 = LazyCorpusLoader(
|
||||
'conll2000',
|
||||
ConllChunkCorpusReader,
|
||||
['train.txt', 'test.txt'],
|
||||
('NP', 'VP', 'PP'),
|
||||
tagset='wsj',
|
||||
encoding='ascii',
|
||||
)
|
||||
# NOTE: the fileid patterns below are raw strings so that ``\.`` reaches the
# regex engine as an escaped dot instead of being an invalid string escape
# (which triggers a DeprecationWarning on Python 3.6+).  The resulting string
# values are unchanged.
conll2002 = LazyCorpusLoader(
    'conll2002',
    ConllChunkCorpusReader,
    r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'),
    encoding='utf-8',
)
conll2007 = LazyCorpusLoader(
    'conll2007',
    DependencyCorpusReader,
    r'.*\.(test|train).*',
    encoding=[('eus', 'ISO-8859-2'), ('esp', 'utf8')],
)
crubadan = LazyCorpusLoader('crubadan', CrubadanCorpusReader, r'.*\.txt')
dependency_treebank = LazyCorpusLoader(
    'dependency_treebank', DependencyCorpusReader, r'.*\.dp', encoding='ascii'
)
|
||||
floresta = LazyCorpusLoader(
|
||||
'floresta',
|
||||
BracketParseCorpusReader,
|
||||
r'(?!\.).*\.ptb',
|
||||
'#',
|
||||
tagset='unknown',
|
||||
encoding='ISO-8859-15',
|
||||
)
|
||||
framenet15 = LazyCorpusLoader(
|
||||
'framenet_v15',
|
||||
FramenetCorpusReader,
|
||||
[
|
||||
'frRelation.xml',
|
||||
'frameIndex.xml',
|
||||
'fulltextIndex.xml',
|
||||
'luIndex.xml',
|
||||
'semTypes.xml',
|
||||
],
|
||||
)
|
||||
framenet = LazyCorpusLoader(
|
||||
'framenet_v17',
|
||||
FramenetCorpusReader,
|
||||
[
|
||||
'frRelation.xml',
|
||||
'frameIndex.xml',
|
||||
'fulltextIndex.xml',
|
||||
'luIndex.xml',
|
||||
'semTypes.xml',
|
||||
],
|
||||
)
|
||||
gazetteers = LazyCorpusLoader(
|
||||
'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt', encoding='ISO-8859-2'
|
||||
)
|
||||
genesis = LazyCorpusLoader(
|
||||
'genesis',
|
||||
PlaintextCorpusReader,
|
||||
r'(?!\.).*\.txt',
|
||||
encoding=[
|
||||
('finnish|french|german', 'latin_1'),
|
||||
('swedish', 'cp865'),
|
||||
('.*', 'utf_8'),
|
||||
],
|
||||
)
|
||||
gutenberg = LazyCorpusLoader(
|
||||
'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
|
||||
)
|
||||
ieer = LazyCorpusLoader('ieer', IEERCorpusReader, r'(?!README|\.).*')
|
||||
inaugural = LazyCorpusLoader(
|
||||
'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
|
||||
)
|
||||
# [XX] This should probably just use TaggedCorpusReader:
|
||||
indian = LazyCorpusLoader(
|
||||
'indian', IndianCorpusReader, r'(?!\.).*\.pos', tagset='unknown', encoding='utf8'
|
||||
)
|
||||
|
||||
jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
|
||||
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
|
||||
lin_thesaurus = LazyCorpusLoader('lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp')
|
||||
mac_morpho = LazyCorpusLoader(
|
||||
'mac_morpho',
|
||||
MacMorphoCorpusReader,
|
||||
r'(?!\.).*\.txt',
|
||||
tagset='unknown',
|
||||
encoding='latin-1',
|
||||
)
|
||||
machado = LazyCorpusLoader(
|
||||
'machado',
|
||||
PortugueseCategorizedPlaintextCorpusReader,
|
||||
r'(?!\.).*\.txt',
|
||||
cat_pattern=r'([a-z]*)/.*',
|
||||
encoding='latin-1',
|
||||
)
|
||||
masc_tagged = LazyCorpusLoader(
|
||||
'masc_tagged',
|
||||
CategorizedTaggedCorpusReader,
|
||||
r'(spoken|written)/.*\.txt',
|
||||
cat_file='categories.txt',
|
||||
tagset='wsj',
|
||||
encoding="utf-8",
|
||||
sep="_",
|
||||
)
|
||||
movie_reviews = LazyCorpusLoader(
|
||||
'movie_reviews',
|
||||
CategorizedPlaintextCorpusReader,
|
||||
r'(?!\.).*\.txt',
|
||||
cat_pattern=r'(neg|pos)/.*',
|
||||
encoding='ascii',
|
||||
)
|
||||
multext_east = LazyCorpusLoader(
|
||||
'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8"
|
||||
)
|
||||
names = LazyCorpusLoader(
|
||||
'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii'
|
||||
)
|
||||
nps_chat = LazyCorpusLoader(
|
||||
'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj'
|
||||
)
|
||||
opinion_lexicon = LazyCorpusLoader(
|
||||
'opinion_lexicon',
|
||||
OpinionLexiconCorpusReader,
|
||||
r'(\w+)\-words\.txt',
|
||||
encoding='ISO-8859-2',
|
||||
)
|
||||
ppattach = LazyCorpusLoader(
|
||||
'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset']
|
||||
)
|
||||
product_reviews_1 = LazyCorpusLoader(
|
||||
'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
|
||||
)
|
||||
product_reviews_2 = LazyCorpusLoader(
|
||||
'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
|
||||
)
|
||||
pros_cons = LazyCorpusLoader(
|
||||
'pros_cons',
|
||||
ProsConsCorpusReader,
|
||||
r'Integrated(Cons|Pros)\.txt',
|
||||
cat_pattern=r'Integrated(Cons|Pros)\.txt',
|
||||
encoding='ISO-8859-2',
|
||||
)
|
||||
ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
|
||||
'ptb',
|
||||
CategorizedBracketParseCorpusReader,
|
||||
r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
|
||||
cat_file='allcats.txt',
|
||||
tagset='wsj',
|
||||
)
|
||||
qc = LazyCorpusLoader(
|
||||
'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'], encoding='ISO-8859-2'
|
||||
)
|
||||
reuters = LazyCorpusLoader(
|
||||
'reuters',
|
||||
CategorizedPlaintextCorpusReader,
|
||||
'(training|test).*',
|
||||
cat_file='cats.txt',
|
||||
encoding='ISO-8859-2',
|
||||
)
|
||||
rte = LazyCorpusLoader('rte', RTECorpusReader, r'(?!\.).*\.xml')
|
||||
senseval = LazyCorpusLoader('senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
|
||||
sentence_polarity = LazyCorpusLoader(
|
||||
'sentence_polarity',
|
||||
CategorizedSentencesCorpusReader,
|
||||
r'rt-polarity\.(neg|pos)',
|
||||
cat_pattern=r'rt-polarity\.(neg|pos)',
|
||||
encoding='utf-8',
|
||||
)
|
||||
sentiwordnet = LazyCorpusLoader(
|
||||
'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8'
|
||||
)
|
||||
shakespeare = LazyCorpusLoader('shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
|
||||
sinica_treebank = LazyCorpusLoader(
|
||||
'sinica_treebank',
|
||||
SinicaTreebankCorpusReader,
|
||||
['parsed'],
|
||||
tagset='unknown',
|
||||
encoding='utf-8',
|
||||
)
|
||||
state_union = LazyCorpusLoader(
|
||||
'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='ISO-8859-2'
|
||||
)
|
||||
stopwords = LazyCorpusLoader(
|
||||
'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8'
|
||||
)
|
||||
subjectivity = LazyCorpusLoader(
|
||||
'subjectivity',
|
||||
CategorizedSentencesCorpusReader,
|
||||
r'(quote.tok.gt9|plot.tok.gt9)\.5000',
|
||||
cat_map={'quote.tok.gt9.5000': ['subj'], 'plot.tok.gt9.5000': ['obj']},
|
||||
encoding='latin-1',
|
||||
)
|
||||
swadesh = LazyCorpusLoader(
|
||||
'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8'
|
||||
)
|
||||
swadesh110 = LazyCorpusLoader(
|
||||
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
|
||||
)
|
||||
swadesh207 = LazyCorpusLoader(
|
||||
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
|
||||
)
|
||||
switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj')
|
||||
timit = LazyCorpusLoader('timit', TimitCorpusReader)
|
||||
# Raw string: ``\.`` is a regex escape, not a valid Python string escape.
timit_tagged = LazyCorpusLoader(
    'timit', TimitTaggedCorpusReader, r'.+\.tags', tagset='wsj', encoding='ascii'
)
|
||||
toolbox = LazyCorpusLoader(
|
||||
'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)'
|
||||
)
|
||||
treebank = LazyCorpusLoader(
|
||||
'treebank/combined',
|
||||
BracketParseCorpusReader,
|
||||
r'wsj_.*\.mrg',
|
||||
tagset='wsj',
|
||||
encoding='ascii',
|
||||
)
|
||||
treebank_chunk = LazyCorpusLoader(
|
||||
'treebank/tagged',
|
||||
ChunkedCorpusReader,
|
||||
r'wsj_.*\.pos',
|
||||
sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
|
||||
para_block_reader=tagged_treebank_para_block_reader,
|
||||
tagset='wsj',
|
||||
encoding='ascii',
|
||||
)
|
||||
treebank_raw = LazyCorpusLoader(
|
||||
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2'
|
||||
)
|
||||
# Raw string: ``\.`` is a regex escape, not a valid Python string escape.
twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader, r'.*\.json')
|
||||
udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
|
||||
udhr2 = LazyCorpusLoader('udhr2', PlaintextCorpusReader, r'.*\.txt', encoding='utf8')
|
||||
universal_treebanks = LazyCorpusLoader(
|
||||
'universal_treebanks_v20',
|
||||
ConllCorpusReader,
|
||||
r'.*\.conll',
|
||||
columntypes=(
|
||||
'ignore',
|
||||
'words',
|
||||
'ignore',
|
||||
'ignore',
|
||||
'pos',
|
||||
'ignore',
|
||||
'ignore',
|
||||
'ignore',
|
||||
'ignore',
|
||||
'ignore',
|
||||
),
|
||||
)
|
||||
verbnet = LazyCorpusLoader('verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
|
||||
webtext = LazyCorpusLoader(
|
||||
'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2'
|
||||
)
|
||||
wordnet = LazyCorpusLoader(
|
||||
'wordnet',
|
||||
WordNetCorpusReader,
|
||||
LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'),
|
||||
)
|
||||
# Raw string: ``\.`` is a regex escape, not a valid Python string escape.
wordnet_ic = LazyCorpusLoader('wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
|
||||
words = LazyCorpusLoader(
|
||||
'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii'
|
||||
)
|
||||
|
||||
# defined after treebank
|
||||
# The frame-file patterns are raw strings so that ``\.`` is passed to the
# regex engine verbatim rather than being an invalid string escape sequence.
# Each loader receives another corpus loader (``treebank`` or ``ptb``) as its
# last positional argument, hence the ordering constraints noted below.
propbank = LazyCorpusLoader(
    'propbank',
    PropbankCorpusReader,
    'prop.txt',
    r'frames/.*\.xml',
    'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank,
)  # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0',
    NombankCorpusReader,
    'nombank.1.0',
    r'frames/.*\.xml',
    'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank,
)  # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
    'propbank',
    PropbankCorpusReader,
    'prop.txt',
    r'frames/.*\.xml',
    'verbs.txt',
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
    'nombank.1.0',
    NombankCorpusReader,
    'nombank.1.0',
    r'frames/.*\.xml',
    'nombank.1.0.words',
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
|
||||
semcor = LazyCorpusLoader(
|
||||
'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml', wordnet
|
||||
) # Must be defined *after* wordnet corpus.
|
||||
|
||||
nonbreaking_prefixes = LazyCorpusLoader(
|
||||
'nonbreaking_prefixes',
|
||||
NonbreakingPrefixesCorpusReader,
|
||||
r'(?!README|\.).*',
|
||||
encoding='utf8',
|
||||
)
|
||||
perluniprops = LazyCorpusLoader(
|
||||
'perluniprops',
|
||||
UnicharsCorpusReader,
|
||||
r'(?!README|\.).*',
|
||||
nltk_data_subdir='misc',
|
||||
encoding='utf8',
|
||||
)
|
||||
|
||||
# mwa_ppdb = LazyCorpusLoader(
|
||||
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
|
||||
|
||||
# See https://github.com/nltk/nltk/issues/1579
|
||||
# and https://github.com/nltk/nltk/issues/1716
|
||||
#
|
||||
# pl196x = LazyCorpusLoader(
|
||||
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
|
||||
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
|
||||
#
|
||||
# ipipan = LazyCorpusLoader(
|
||||
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
|
||||
#
|
||||
# nkjp = LazyCorpusLoader(
|
||||
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
|
||||
#
|
||||
# panlex_lite = LazyCorpusLoader(
|
||||
# 'panlex_lite', PanLexLiteCorpusReader)
|
||||
#
|
||||
# ycoe = LazyCorpusLoader(
|
||||
# 'ycoe', YCOECorpusReader)
|
||||
#
|
||||
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
|
||||
# hebrew_treebank = LazyCorpusLoader(
|
||||
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
|
||||
|
||||
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
|
||||
def demo():
    """
    Call ``demo()`` on a fixed selection of the corpora defined in this
    module, one after another, in the order listed below.

    NOTE: the original author flagged this list as out-of-date.
    """
    # chat80 used to sit between brown and cmudict; it was already disabled.
    selected = (
        abc,
        brown,
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
    )
    for corpus in selected:
        corpus.demo()
|
||||
|
||||
|
||||
# ycoe.demo()
|
||||
|
||||
if __name__ == '__main__':
|
||||
# demo()
|
||||
pass
|
||||
|
||||
# ** this is for nose **
|
||||
# unload all corpus after tests
|
||||
def teardown_module(module=None):
    """
    Test-runner teardown hook: unload every corpus exposed by ``nltk.corpus``.

    Walks all attributes of the ``nltk.corpus`` package and calls
    ``_unload()`` on each one that is a ``CorpusReader`` instance and
    provides that method.

    :param module: not used by this function.
    """
    import nltk.corpus as corpus_pkg

    for attr_name in dir(corpus_pkg):
        candidate = getattr(corpus_pkg, attr_name, None)
        if not isinstance(candidate, CorpusReader):
            continue
        if hasattr(candidate, '_unload'):
            candidate._unload()
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
55
venv/lib/python3.7/site-packages/nltk/corpus/europarl_raw.py
Normal file
55
venv/lib/python3.7/site-packages/nltk/corpus/europarl_raw.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# Natural Language Toolkit: Europarl Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import re
|
||||
from nltk.corpus.util import LazyCorpusLoader
|
||||
from nltk.corpus.reader import *
|
||||
|
||||
# Create a new corpus reader instance for each European language
|
||||
# One lazily-loaded reader per language.  Each entry gives the corpus
# sub-directory, the reader class, the fileid pattern and the text encoding.
danish = LazyCorpusLoader(
    'europarl_raw/danish',
    EuroparlCorpusReader,
    r'ep-.*\.da',
    encoding='utf-8',
)

dutch = LazyCorpusLoader(
    'europarl_raw/dutch',
    EuroparlCorpusReader,
    r'ep-.*\.nl',
    encoding='utf-8',
)

english = LazyCorpusLoader(
    'europarl_raw/english',
    EuroparlCorpusReader,
    r'ep-.*\.en',
    encoding='utf-8',
)

finnish = LazyCorpusLoader(
    'europarl_raw/finnish',
    EuroparlCorpusReader,
    r'ep-.*\.fi',
    encoding='utf-8',
)

french = LazyCorpusLoader(
    'europarl_raw/french',
    EuroparlCorpusReader,
    r'ep-.*\.fr',
    encoding='utf-8',
)

german = LazyCorpusLoader(
    'europarl_raw/german',
    EuroparlCorpusReader,
    r'ep-.*\.de',
    encoding='utf-8',
)

greek = LazyCorpusLoader(
    'europarl_raw/greek',
    EuroparlCorpusReader,
    r'ep-.*\.el',
    encoding='utf-8',
)

italian = LazyCorpusLoader(
    'europarl_raw/italian',
    EuroparlCorpusReader,
    r'ep-.*\.it',
    encoding='utf-8',
)

portuguese = LazyCorpusLoader(
    'europarl_raw/portuguese',
    EuroparlCorpusReader,
    r'ep-.*\.pt',
    encoding='utf-8',
)

spanish = LazyCorpusLoader(
    'europarl_raw/spanish',
    EuroparlCorpusReader,
    r'ep-.*\.es',
    encoding='utf-8',
)

swedish = LazyCorpusLoader(
    'europarl_raw/swedish',
    EuroparlCorpusReader,
    r'ep-.*\.sv',
    encoding='utf-8',
)
|
||||
183
venv/lib/python3.7/site-packages/nltk/corpus/reader/__init__.py
Normal file
183
venv/lib/python3.7/site-packages/nltk/corpus/reader/__init__.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# Natural Language Toolkit: Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
NLTK corpus readers. The modules in this package provide functions
|
||||
that can be used to read corpus fileids in a variety of formats. These
|
||||
functions can be used to read both the corpus fileids that are
|
||||
distributed in the NLTK corpus package, and corpus fileids that are part
|
||||
of external corpora.
|
||||
|
||||
Corpus Reader Functions
|
||||
=======================
|
||||
Each corpus module defines one or more "corpus reader functions",
|
||||
which can be used to read documents from that corpus. These functions
|
||||
take an argument, ``item``, which is used to indicate which document
|
||||
should be read from the corpus:
|
||||
|
||||
- If ``item`` is one of the unique identifiers listed in the corpus
|
||||
module's ``items`` variable, then the corresponding document will
|
||||
be loaded from the NLTK corpus package.
|
||||
- If ``item`` is a fileid, then that file will be read.
|
||||
|
||||
Additionally, corpus reader functions can be given lists of item
|
||||
names; in which case, they will return a concatenation of the
|
||||
corresponding documents.
|
||||
|
||||
Corpus reader functions are named based on the type of information
|
||||
they return. Some common examples, and their return types, are:
|
||||
|
||||
- words(): list of str
|
||||
- sents(): list of (list of str)
|
||||
- paras(): list of (list of (list of str))
|
||||
- tagged_words(): list of (str,str) tuple
|
||||
- tagged_sents(): list of (list of (str,str))
|
||||
- tagged_paras(): list of (list of (list of (str,str)))
|
||||
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
||||
- parsed_sents(): list of (Tree with str leaves)
|
||||
- parsed_paras(): list of (list of (Tree with str leaves))
|
||||
- xml(): A single xml ElementTree
|
||||
- raw(): unprocessed corpus contents
|
||||
|
||||
For example, to read a list of the words in the Brown Corpus, use
|
||||
``nltk.corpus.brown.words()``:
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> print(", ".join(brown.words()))
|
||||
The, Fulton, County, Grand, Jury, said, ...
|
||||
|
||||
"""
|
||||
|
||||
from nltk.corpus.reader.plaintext import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.tagged import *
|
||||
from nltk.corpus.reader.cmudict import *
|
||||
from nltk.corpus.reader.conll import *
|
||||
from nltk.corpus.reader.chunked import *
|
||||
from nltk.corpus.reader.wordlist import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
from nltk.corpus.reader.ppattach import *
|
||||
from nltk.corpus.reader.senseval import *
|
||||
from nltk.corpus.reader.ieer import *
|
||||
from nltk.corpus.reader.sinica_treebank import *
|
||||
from nltk.corpus.reader.bracket_parse import *
|
||||
from nltk.corpus.reader.indian import *
|
||||
from nltk.corpus.reader.toolbox import *
|
||||
from nltk.corpus.reader.timit import *
|
||||
from nltk.corpus.reader.ycoe import *
|
||||
from nltk.corpus.reader.rte import *
|
||||
from nltk.corpus.reader.string_category import *
|
||||
from nltk.corpus.reader.propbank import *
|
||||
from nltk.corpus.reader.verbnet import *
|
||||
from nltk.corpus.reader.bnc import *
|
||||
from nltk.corpus.reader.nps_chat import *
|
||||
from nltk.corpus.reader.wordnet import *
|
||||
from nltk.corpus.reader.switchboard import *
|
||||
from nltk.corpus.reader.dependency import *
|
||||
from nltk.corpus.reader.nombank import *
|
||||
from nltk.corpus.reader.ipipan import *
|
||||
from nltk.corpus.reader.pl196x import *
|
||||
from nltk.corpus.reader.knbc import *
|
||||
from nltk.corpus.reader.chasen import *
|
||||
from nltk.corpus.reader.childes import *
|
||||
from nltk.corpus.reader.aligned import *
|
||||
from nltk.corpus.reader.lin import *
|
||||
from nltk.corpus.reader.semcor import *
|
||||
from nltk.corpus.reader.framenet import *
|
||||
from nltk.corpus.reader.udhr import *
|
||||
from nltk.corpus.reader.bnc import *
|
||||
from nltk.corpus.reader.sentiwordnet import *
|
||||
from nltk.corpus.reader.twitter import *
|
||||
from nltk.corpus.reader.nkjp import *
|
||||
from nltk.corpus.reader.crubadan import *
|
||||
from nltk.corpus.reader.mte import *
|
||||
from nltk.corpus.reader.reviews import *
|
||||
from nltk.corpus.reader.opinion_lexicon import *
|
||||
from nltk.corpus.reader.pros_cons import *
|
||||
from nltk.corpus.reader.categorized_sents import *
|
||||
from nltk.corpus.reader.comparative_sents import *
|
||||
from nltk.corpus.reader.panlex_lite import *
|
||||
from nltk.corpus.reader.panlex_swadesh import *
|
||||
|
||||
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
|
||||
# the function bracket_parse() defined in nltk.tree:
|
||||
from nltk.corpus.reader import bracket_parse
|
||||
|
||||
# Public names re-exported by ``from nltk.corpus.reader import *``.
# Each name appears exactly once ('BNCCorpusReader' was previously listed
# twice); the order of first occurrence is preserved.
__all__ = [
    'CorpusReader',
    'CategorizedCorpusReader',
    'PlaintextCorpusReader',
    'find_corpus_fileids',
    'TaggedCorpusReader',
    'CMUDictCorpusReader',
    'ConllChunkCorpusReader',
    'WordListCorpusReader',
    'PPAttachmentCorpusReader',
    'SensevalCorpusReader',
    'IEERCorpusReader',
    'ChunkedCorpusReader',
    'SinicaTreebankCorpusReader',
    'BracketParseCorpusReader',
    'IndianCorpusReader',
    'ToolboxCorpusReader',
    'TimitCorpusReader',
    'YCOECorpusReader',
    'MacMorphoCorpusReader',
    'SyntaxCorpusReader',
    'AlpinoCorpusReader',
    'RTECorpusReader',
    'StringCategoryCorpusReader',
    'EuroparlCorpusReader',
    'CategorizedBracketParseCorpusReader',
    'CategorizedTaggedCorpusReader',
    'CategorizedPlaintextCorpusReader',
    'PortugueseCategorizedPlaintextCorpusReader',
    'tagged_treebank_para_block_reader',
    'PropbankCorpusReader',
    'VerbnetCorpusReader',
    'BNCCorpusReader',
    'ConllCorpusReader',
    'XMLCorpusReader',
    'NPSChatCorpusReader',
    'SwadeshCorpusReader',
    'WordNetCorpusReader',
    'WordNetICCorpusReader',
    'SwitchboardCorpusReader',
    'DependencyCorpusReader',
    'NombankCorpusReader',
    'IPIPANCorpusReader',
    'Pl196xCorpusReader',
    'TEICorpusView',
    'KNBCorpusReader',
    'ChasenCorpusReader',
    'CHILDESCorpusReader',
    'AlignedCorpusReader',
    'TimitTaggedCorpusReader',
    'LinThesaurusCorpusReader',
    'SemcorCorpusReader',
    'FramenetCorpusReader',
    'UdhrCorpusReader',
    'SentiWordNetCorpusReader',
    'SentiSynset',
    'TwitterCorpusReader',
    'NKJPCorpusReader',
    'CrubadanCorpusReader',
    'MTECorpusReader',
    'ReviewsCorpusReader',
    'OpinionLexiconCorpusReader',
    'ProsConsCorpusReader',
    'CategorizedSentencesCorpusReader',
    'ComparativeSentencesCorpusReader',
    'PanLexLiteCorpusReader',
    'NonbreakingPrefixesCorpusReader',
    'UnicharsCorpusReader',
    'MWAPPDBCorpusReader',
    'PanlexSwadeshCorpusReader',
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
168
venv/lib/python3.7/site-packages/nltk/corpus/reader/aligned.py
Normal file
168
venv/lib/python3.7/site-packages/nltk/corpus/reader/aligned.py
Normal file
@@ -0,0 +1,168 @@
|
||||
# Natural Language Toolkit: Aligned Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# URL: <http://nltk.org/>
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
|
||||
from nltk.translate import AlignedSent, Alignment
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
from nltk.corpus.reader.util import (
|
||||
StreamBackedCorpusView,
|
||||
concat,
|
||||
read_alignedsent_block,
|
||||
)
|
||||
|
||||
|
||||
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Stored on the reader as ``_sep``; it is not consulted
            by any method of this class.
        :param word_tokenizer: Tokenizer whose ``tokenize()`` splits one
            sentence string into words.
        :param sent_tokenizer: Tokenizer whose ``tokenize()`` splits one
            aligned-sentence string into its component lines.
        :param alignedsent_block_reader: Callable taking a stream and
            returning the aligned-sentence strings read from it.
        :param encoding: Encoding passed on to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def raw(self, fileids=None):
        """
        :param fileids: A single fileid, a list of fileids, or None for
            every fileid of this corpus.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            # Close each stream as soon as it has been read instead of
            # leaving the open handle to the garbage collector.
            with self.open(fileid) as stream:
                contents.append(stream.read())
        return concat(contents)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._alignedsent_views(fileids, False, False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._alignedsent_views(fileids, False, True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._alignedsent_views(fileids, True, True)

    def _alignedsent_views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per requested file and
        concatenate them.

        :param fileids: Passed through to ``abspaths()``.
        :param aligned: Forwarded to the view as its ``aligned`` flag.
        :param group_by_sent: Forwarded to the view as its
            ``group_by_sent`` flag.
        """
        return concat(
            [
                AlignedSentCorpusView(
                    path,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (path, enc) in self.abspaths(fileids, True)
            ]
        )
|
||||
|
||||
|
||||
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        """
        :param corpus_file: The file this view reads; passed on to
            ``StreamBackedCorpusView``.
        :param encoding: Encoding passed on to ``StreamBackedCorpusView``.
        :param aligned: If true, ``read_block`` yields ``AlignedSent``
            objects.
        :param group_by_sent: If true (and ``aligned`` is false),
            ``read_block`` yields the first sentence as a list of words
            rather than a flat list of words.
        :param word_tokenizer: Tokenizer applied to each sentence string.
        :param sent_tokenizer: Tokenizer applied to each aligned-sentence
            string.
        :param alignedsent_block_reader: Callable taking a stream and
            returning aligned-sentence strings.
        """
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read the next aligned sentence from ``stream`` and return it in the
        form selected by the ``aligned`` / ``group_by_sent`` flags.

        :return: ``[AlignedSent]`` when aligned; ``[words]`` when grouped by
            sentence; otherwise the flat word list of the first sentence.
            An empty list is returned when nothing was read.
        :rtype: list
        """
        block = [
            self._word_tokenizer.tokenize(sent_str)
            for alignedsent_str in self._alignedsent_block_reader(stream)
            for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
        ]
        if not block:
            # Nothing was produced by the block reader: report zero tokens
            # instead of failing on the index lookups below.
            return []
        if self._aligned:
            block[2] = Alignment.fromstring(
                " ".join(block[2])
            )  # kludge; we shouldn't have tokenized the alignment string
            block = [AlignedSent(*block)]
        elif self._group_by_sent:
            block = [block[0]]
        else:
            block = block[0]

        return block
|
||||
484
venv/lib/python3.7/site-packages/nltk/corpus/reader/api.py
Normal file
484
venv/lib/python3.7/site-packages/nltk/corpus/reader/api.py
Normal file
@@ -0,0 +1,484 @@
|
||||
# Natural Language Toolkit: API for Corpus Readers
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
API for corpus readers.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk import compat
|
||||
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class CorpusReader(object):
    """
    A base class for "corpus reader" classes, each of which can be
    used to read a specific corpus format.  Each individual corpus
    reader instance is used to read a specific corpus, consisting of
    one or more files under a common root directory.  Each file is
    identified by its ``file identifier``, which is the relative path
    to the file from the root directory.

    A separate subclass is defined for each corpus format.  These
    subclasses define one or more methods that provide 'views' on the
    corpus contents, such as ``words()`` (for a list of words) and
    ``parsed_sents()`` (for a list of parsed sentences).  Called with
    no arguments, these methods will return the contents of the entire
    corpus.  For most corpora, these methods define one or more
    selection arguments, such as ``fileids`` or ``categories``, which can
    be used to select which portion of the corpus should be returned.
    """

    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        """
        :type root: PathPointer or str
        :param root: A path pointer identifying the root directory for
            this corpus.  If a string is specified, then it will be
            converted to a ``PathPointer`` automatically.
        :param fileids: A list of the files that make up this corpus.
            This list can either be specified explicitly, as a list of
            strings; or implicitly, as a regular expression over file
            paths.  The absolute path for each file will be constructed
            by joining the reader's root to each file name.
        :param encoding: The default unicode encoding for the files
            that make up the corpus.  The value of ``encoding`` can be any
            of the following:

            - A string: ``encoding`` is the encoding name for all files.
            - A dictionary: ``encoding[file_id]`` is the encoding
              name for the file whose identifier is ``file_id``.  If
              ``file_id`` is not in ``encoding``, then the file
              contents will be processed using non-unicode byte strings.
            - A list: ``encoding`` should be a list of ``(regexp, encoding)``
              tuples.  The encoding for a file whose identifier is ``file_id``
              will be the ``encoding`` value for the first tuple whose
              ``regexp`` matches the ``file_id``.  If no tuple's ``regexp``
              matches the ``file_id``, the file contents will be processed
              using non-unicode byte strings.
            - None: the file contents of all files will be
              processed using non-unicode byte strings.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            tagged_...() methods.
        :raise TypeError: If ``root`` is neither a string nor a
            ``PathPointer``.
        """
        # Convert the root to a path pointer, if necessary.
        if isinstance(root, string_types) and not isinstance(root, PathPointer):
            # Raw string: a plain literal containing '\.' is an invalid
            # escape sequence.  The trailing '|' alternative makes the
            # pattern always match, leaving both groups None for non-zip
            # roots.
            m = re.match(r'(.*\.zip)/?(.*)$|', root)
            zipfile, zipentry = m.groups()
            if zipfile:
                root = ZipFilePathPointer(zipfile, zipentry)
            else:
                root = FileSystemPathPointer(root)
        elif not isinstance(root, PathPointer):
            raise TypeError('CorpusReader: expected a string or a PathPointer')

        # If `fileids` is a regexp, then expand it.
        if isinstance(fileids, string_types):
            fileids = find_corpus_fileids(root, fileids)

        #: A list of the relative paths for the fileids that make up
        #: this corpus.
        self._fileids = fileids

        #: The root directory for this corpus.
        self._root = root

        # If encoding was specified as a list of regexps, then convert
        # it to a dictionary (first matching regexp wins; files that match
        # nothing get no entry).
        if isinstance(encoding, list):
            encoding_dict = {}
            for fileid in self._fileids:
                for x in encoding:
                    (regexp, enc) = x
                    if re.match(regexp, fileid):
                        encoding_dict[fileid] = enc
                        break
            encoding = encoding_dict

        #: The default unicode encoding for the fileids that make up
        #: this corpus.  If ``encoding`` is None, then the file
        #: contents are processed using byte strings.
        self._encoding = encoding
        self._tagset = tagset

    def __repr__(self):
        if isinstance(self._root, ZipFilePathPointer):
            path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
        else:
            path = '%s' % self._root.path
        return '<%s in %r>' % (self.__class__.__name__, path)

    def ensure_loaded(self):
        """
        Load this corpus (if it has not already been loaded).  This is
        used by LazyCorpusLoader as a simple method that can be used to
        make sure a corpus is loaded -- e.g., in case a user wants to
        do help(some_corpus).
        """
        pass  # no need to actually do anything.

    def _read_whole_file(self, fileid):
        """
        Return the full contents of ``fileid``, closing the stream
        afterwards (even if reading fails) so no handle is leaked.
        """
        stream = self.open(fileid)
        try:
            return stream.read()
        finally:
            stream.close()

    def readme(self):
        """
        Return the contents of the corpus README file, if it exists.
        """
        return self._read_whole_file("README")

    def license(self):
        """
        Return the contents of the corpus LICENSE file, if it exists.
        """
        return self._read_whole_file("LICENSE")

    def citation(self):
        """
        Return the contents of the corpus citation.bib file, if it exists.
        """
        return self._read_whole_file("citation.bib")

    def fileids(self):
        """
        Return a list of file identifiers for the fileids that make up
        this corpus.
        """
        return self._fileids

    def abspath(self, fileid):
        """
        Return the absolute path for the given file.

        :type fileid: str
        :param fileid: The file identifier for the file whose path
            should be returned.
        :rtype: PathPointer
        """
        return self._root.join(fileid)

    def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
        """
        Return a list of the absolute paths for all fileids in this corpus;
        or for the given list of fileids, if specified.

        :type fileids: None or str or list
        :param fileids: Specifies the set of fileids for which paths should
            be returned.  Can be None, for all fileids; a list of
            file identifiers, for a specified set of fileids; or a single
            file identifier, for a single file.  Note that the return
            value is always a list of paths, even if ``fileids`` is a
            single file identifier.

        :param include_encoding: If true, then return a list of
            ``(path_pointer, encoding)`` tuples.

        :param include_fileid: If true, then each returned tuple also ends
            with the file identifier: ``(path_pointer, fileid)``, or
            ``(path_pointer, encoding, fileid)`` when ``include_encoding``
            is true as well.

        :rtype: list(PathPointer), or a list of tuples when either
            ``include_*`` flag is set.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        paths = [self._root.join(f) for f in fileids]

        if include_encoding and include_fileid:
            return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
        elif include_fileid:
            return list(zip(paths, fileids))
        elif include_encoding:
            return list(zip(paths, [self.encoding(f) for f in fileids]))
        else:
            return paths

    def open(self, file):
        """
        Return an open stream that can be used to read the given file.
        If the file's encoding is not None, then the stream will
        automatically decode the file's contents into unicode.

        :param file: The file identifier of the file to read.
        """
        encoding = self.encoding(file)
        stream = self._root.join(file).open(encoding)
        return stream

    def encoding(self, file):
        """
        Return the unicode encoding for the given corpus file, if known.
        If the encoding is unknown, or if the given file should be
        processed using byte strings (str), then return None.
        """
        if isinstance(self._encoding, dict):
            return self._encoding.get(file)
        else:
            return self._encoding

    def _get_root(self):
        return self._root

    root = property(
        _get_root,
        doc="""
        The directory where this corpus is stored.

        :type: PathPointer""",
    )
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Corpora containing categorized items
|
||||
######################################################################
|
||||
|
||||
|
||||
class CategorizedCorpusReader(object):
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

      - Call ``__init__()`` to set up the mapping.

      - Override all view methods to accept a ``categories`` parameter,
        which can be used *instead* of the ``fileids`` parameter, to
        select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier.  The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.

          - cat_map: A dictionary, mapping from file identifiers to
            category labels.

          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories.  The argument
            ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.

        :param kwargs: The caller's keyword-argument dictionary; it is
            modified in place.
        :raise ValueError: If none, or more than one, of the three
            arguments is present.
        """
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping

        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``

        if 'cat_pattern' in kwargs:
            self._pattern = kwargs['cat_pattern']
            del kwargs['cat_pattern']
        elif 'cat_map' in kwargs:
            self._map = kwargs['cat_map']
            del kwargs['cat_map']
        elif 'cat_file' in kwargs:
            self._file = kwargs['cat_file']
            del kwargs['cat_file']
            if 'cat_delimiter' in kwargs:
                self._delimiter = kwargs['cat_delimiter']
                del kwargs['cat_delimiter']
        else:
            raise ValueError(
                'Expected keyword argument cat_pattern or ' 'cat_map or cat_file.'
            )

        # Whichever argument was consumed above is gone now, so anything
        # still present means the caller supplied more than one.
        if 'cat_pattern' in kwargs or 'cat_map' in kwargs or 'cat_file' in kwargs:
            raise ValueError(
                'Specify exactly one of: cat_pattern, ' 'cat_map, cat_file.'
            )

    def _init(self):
        """
        Build the file-to-category and category-to-file mappings from
        whichever source (pattern, map or file) ``__init__`` recorded.
        """
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            # Read everything up front and close the stream so the handle
            # is not leaked.
            stream = self.open(self._file)
            try:
                lines = stream.readlines()
            finally:
                stream.close()
            # One set build instead of a linear list scan per line.
            known_fileids = set(self.fileids())
            for line in lines:
                line = line.strip()
                file_id, categories = line.split(self._delimiter, 1)
                if file_id not in known_fileids:
                    raise ValueError(
                        'In category mapping file %s: %s '
                        'not found' % (self._file, file_id)
                    )
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)

    def _add(self, file_id, category):
        """Record that ``file_id`` belongs to ``category`` in both mappings."""
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.

        :param fileids: None for all categories, a single file identifier,
            or an iterable of file identifiers.  Unknown identifiers
            contribute no categories; an empty iterable yields ``[]``.
        :rtype: list
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, string_types):
            fileids = [fileids]
        # ``set().union`` copes with an empty selection, and ``.get`` avoids
        # inserting unknown keys into the defaultdict as a side effect.
        return sorted(set().union(*(self._f2c.get(d, ()) for d in fileids)))

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.

        :param categories: None for every file, a single category name, or
            an iterable of category names.  Unknown names in an iterable
            contribute no files; an empty iterable yields ``[]``.
        :raise ValueError: If a single category name is given and it is
            not a known category.
        :rtype: list
        """
        if categories is None:
            return super(CategorizedCorpusReader, self).fileids()
        if self._f2c is None:
            self._init()
        if isinstance(categories, string_types):
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            raise ValueError('Category %s not found' % categories)
        # Use ``.get`` so that asking for an unknown category does not
        # register it in ``_c2f`` (which would make it show up in
        # ``categories()`` and silence the ValueError above).
        return sorted(set().union(*(self._c2f.get(c, ()) for c in categories)))
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Treebank readers
|
||||
######################################################################
|
||||
|
||||
# [xx] is it worth it to factor this out?
|
||||
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block (and an optional target tagset) and
        returns a list of list of tagged words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.
    """

    def _parse(self, s):
        raise NotImplementedError()

    def _word(self, s):
        raise NotImplementedError()

    def _tag(self, s, tagset=None):
        # ``tagset`` is part of the signature because
        # ``_read_tagged_sent_block`` always calls ``self._tag(t, tagset)``.
        raise NotImplementedError()

    def _read_block(self, stream):
        raise NotImplementedError()

    def raw(self, fileids=None):
        """
        Return the concatenated raw contents of the given file(s).

        :param fileids: None for every file in the corpus, a single file
            identifier, or a list of file identifiers.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                # Close explicitly so handles are not left to the
                # garbage collector.
                stream.close()
        return concat(contents)

    def parsed_sents(self, fileids=None):
        """
        Return a view of the given file(s) whose items are produced by
        ``_parse`` for each block.
        """
        reader = self._read_parsed_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        Return a view of the given file(s) whose items are produced by
        ``_tag`` for each block; ``tagset`` is forwarded to ``_tag``.
        """

        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return a view of the given file(s) whose items are produced by
        ``_word`` for each block.
        """
        reader = self._read_sent_block
        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        """
        Like ``tagged_sents()``, but with the per-block results flattened
        into a single sequence.
        """

        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)

        return concat(
            [
                StreamBackedCorpusView(fileid, reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        Like ``sents()``, but with the per-block results flattened into a
        single sequence.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    # ------------------------------------------------------------
    # { Block Readers

    def _read_word_block(self, stream):
        return list(chain(*self._read_sent_block(stream)))

    def _read_tagged_word_block(self, stream, tagset=None):
        return list(chain(*self._read_tagged_sent_block(stream, tagset)))

    def _read_sent_block(self, stream):
        # filter(None, ...) drops falsy results such as empty sentences.
        return list(filter(None, [self._word(t) for t in self._read_block(stream)]))

    def _read_tagged_sent_block(self, stream, tagset=None):
        return list(
            filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
        )

    def _read_parsed_sent_block(self, stream):
        return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))

    # } End of Block Readers
    # ------------------------------------------------------------
|
||||
258
venv/lib/python3.7/site-packages/nltk/corpus/reader/bnc.py
Normal file
258
venv/lib/python3.7/site-packages/nltk/corpus/reader/bnc.py
Normal file
@@ -0,0 +1,258 @@
|
||||
# Natural Language Toolkit: British National Corpus (BNC) Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""Corpus reader for the XML version of the British National Corpus."""
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree
|
||||
|
||||
|
||||
class BNCCorpusReader(XMLCorpusReader):
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    http://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    """
    # The docstring above is a raw string because the example regexp
    # contains ``\w`` and ``\.``, which are invalid escape sequences in a
    # plain string literal.

    def __init__(self, root, fileids, lazy=True):
        """
        :param root: Forwarded to ``XMLCorpusReader``.
        :param fileids: Forwarded to ``XMLCorpusReader``.
        :param lazy: If true, views are ``BNCWordView`` objects; otherwise
            each file is parsed eagerly by ``_words``.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, False, None, strip_space, stem)

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = 'c5' if c5 else 'pos'
        return self._views(fileids, False, tag, strip_space, stem)

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(fileids, True, None, strip_space, stem)

    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        tag = 'c5' if c5 else 'pos'
        return self._views(
            fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem
        )

    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """A helper function that instantiates BNCWordViews or the list of words/sentences."""
        f = BNCWordView if self._lazy else self._words
        return concat(
            [
                f(fileid, sent, tag, strip_space, stem)
                for fileid in self.abspaths(fileids)
            ]
        )

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                word = xmlword.text
                if not word:
                    word = ""  # fixes issue 337?
                if strip_space or stem:
                    word = word.strip()
                if stem:
                    # Fall back to the surface form when there is no
                    # 'hw' attribute.
                    word = xmlword.get('hw', word)
                if tag == 'c5':
                    word = (word, xmlword.get('c5'))
                elif tag == 'pos':
                    # Fall back to the c5 tag when there is no 'pos'.
                    word = (word, xmlword.get('pos', xmlword.get('c5')))
                sent.append(word)
            if bracket_sent:
                result.append(BNCSentence(xmlsent.attrib['n'], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result
|
||||
|
||||
|
||||
def _all_xmlwords_in(elt, result=None):
    """
    Collect, in document order, every ``<c>`` and ``<w>`` element found
    beneath ``elt``.  Matching elements are not searched further; any
    other child is descended into recursively.

    :param elt: The element whose descendants are searched.
    :param result: Optional list to append to; a new list is created when
        omitted.
    :return: The list the matches were appended to.
    """
    found = [] if result is None else result
    for node in elt:
        if node.tag not in ('c', 'w'):
            _all_xmlwords_in(node, found)
            continue
        found.append(node)
    return found
|
||||
|
||||
|
||||
class BNCSentence(list):
    """
    A list of words that additionally carries the sentence identifier
    (the ``n`` attribute from the XML) in its ``num`` attribute.
    """

    def __init__(self, num, items):
        """
        :param num: The sentence identifier to store as ``self.num``.
        :param items: The words used to populate the list.
        """
        super(BNCSentence, self).__init__(items)
        self.num = num
|
||||
|
||||
|
||||
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """

    #: These tags are ignored. For their description refer to the
    #: technical documentation, for example,
    #: http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
    tags_to_ignore = {
        'pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align'
    }

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        if sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(.*/)?(c|w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  The stream is closed even if reading
        # the header fails, so the handle is never leaked.
        self._open()
        try:
            self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        finally:
            self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    @staticmethod
    def _joined_text(elts, sep='\n'):
        """
        Join the stripped text of each element in ``elts`` with ``sep``.
        Elements without text (``elt.text is None``) contribute an empty
        string instead of raising AttributeError.
        """
        return sep.join((elt.text or '').strip() for elt in elts)

    def handle_header(self, elt, context):
        """
        Populate ``title``, ``author``, ``editor`` and ``resps`` from the
        header element ``elt``.  Attributes whose elements are absent keep
        their previous value.
        """
        # Set up some metadata!
        titles = elt.findall('titleStmt/title')
        if titles:
            self.title = self._joined_text(titles)

        authors = elt.findall('titleStmt/author')
        if authors:
            self.author = self._joined_text(authors)

        editors = elt.findall('titleStmt/editor')
        if editors:
            self.editor = self._joined_text(editors)

        resps = elt.findall('titleStmt/respStmt')
        if resps:
            # One paragraph per <respStmt>, one line per child element.
            self.resps = '\n\n'.join(self._joined_text(resp) for resp in resps)

    def handle_elt(self, elt, context):
        """Dispatch to sentence or word handling depending on ``sent``."""
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """
        Convert a word/punctuation element into a string, or into a
        ``(word, tag)`` tuple when a tagset was requested.
        """
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # Fall back to the surface form when there is no 'hw' attribute.
            word = elt.get('hw', word)
        if self._tag == 'c5':
            word = (word, elt.get('c5'))
        elif self._tag == 'pos':
            # Fall back to the c5 tag when there is no 'pos' attribute.
            word = (word, elt.get('pos', elt.get('c5')))
        return word

    def handle_sent(self, elt):
        """
        Convert a sentence element into a ``BNCSentence``.

        :raise ValueError: If a child element is neither a word, a known
            wrapper element, nor listed in ``tags_to_ignore``.
        """
        sent = []
        for child in elt:
            if child.tag in ('mw', 'hi', 'corr', 'trunc'):
                # Wrapper elements: take their direct children as words.
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ('w', 'c'):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError('Unexpected element %s' % child.tag)
        return BNCSentence(elt.attrib['n'], sent)
|
||||
@@ -0,0 +1,271 @@
|
||||
# Natural Language Toolkit: Penn Treebank Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
"""
|
||||
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
from nltk.tree import Tree
|
||||
from nltk.tag import map_tag
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
# we use [^\s()]+ instead of \S+? to avoid matching ()
|
||||
SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
|
||||
TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
|
||||
WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
|
||||
EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
|
||||
|
||||
|
||||
class BracketParseCorpusReader(SyntaxCorpusReader):
|
||||
"""
|
||||
Reader for corpora that consist of parenthesis-delineated parse trees,
|
||||
like those found in the "combined" section of the Penn Treebank,
|
||||
e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
root,
|
||||
fileids,
|
||||
comment_char=None,
|
||||
detect_blocks='unindented_paren',
|
||||
encoding='utf8',
|
||||
tagset=None,
|
||||
):
|
||||
"""
|
||||
:param root: The root directory for this corpus.
|
||||
:param fileids: A list or regexp specifying the fileids in this corpus.
|
||||
:param comment_char: The character which can appear at the start of
|
||||
a line to indicate that the rest of the line is a comment.
|
||||
:param detect_blocks: The method that is used to find blocks
|
||||
in the corpus; can be 'unindented_paren' (every unindented
|
||||
parenthesis starts a new parse) or 'sexpr' (brackets are
|
||||
matched).
|
||||
:param tagset: The name of the tagset used by this corpus, to be used
|
||||
for normalizing or converting the POS tags returned by the
|
||||
tagged_...() methods.
|
||||
"""
|
||||
# FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
|
||||
# from CorpusReader?
|
||||
CorpusReader.__init__(self, root, fileids, encoding)
|
||||
self._comment_char = comment_char
|
||||
self._detect_blocks = detect_blocks
|
||||
self._tagset = tagset
|
||||
|
||||
def _read_block(self, stream):
|
||||
if self._detect_blocks == 'sexpr':
|
||||
return read_sexpr_block(stream, comment_char=self._comment_char)
|
||||
elif self._detect_blocks == 'blankline':
|
||||
return read_blankline_block(stream)
|
||||
elif self._detect_blocks == 'unindented_paren':
|
||||
# Tokens start with unindented left parens.
|
||||
toks = read_regexp_block(stream, start_re=r'^\(')
|
||||
# Strip any comments out of the tokens.
|
||||
if self._comment_char:
|
||||
toks = [
|
||||
re.sub('(?m)^%s.*' % re.escape(self._comment_char), '', tok)
|
||||
for tok in toks
|
||||
]
|
||||
return toks
|
||||
else:
|
||||
assert 0, 'bad block type'
|
||||
|
||||
def _normalize(self, t):
|
||||
# If there's an empty set of brackets surrounding the actual
|
||||
# parse, then strip them off.
|
||||
if EMPTY_BRACKETS.match(t):
|
||||
t = t.strip()[1:-1]
|
||||
# Replace leaves of the form (!), (,), with (! !), (, ,)
|
||||
t = re.sub(r"\((.)\)", r"(\1 \1)", t)
|
||||
# Replace leaves of the form (tag word root) with (tag word)
|
||||
t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
|
||||
return t
|
||||
|
||||
def _parse(self, t):
    """
    Parse one raw bracketed block into a ``Tree``.

    The block is normalized and handed to ``Tree.fromstring``. If that
    raises ``ValueError``, a diagnostic is written to ``sys.stderr`` and
    recovery is attempted:

    - when the error is exactly ``('mismatched parens',)``, up to four
      closing parentheses are appended and the parse is retried;
    - otherwise (or if that fails) a flat tree labelled ``'S'`` is built
      from the tagged words returned by ``self._tag``.

    :param t: The raw block text.
    :return: The parsed (or recovered) tree.
    """
    try:
        return Tree.fromstring(self._normalize(t))

    except ValueError as e:
        sys.stderr.write("Bad tree detected; trying to recover...\n")
        # Try to recover, if we can:
        if e.args == ('mismatched parens',):
            for n in range(1, 5):
                try:
                    # The repaired text must be *parsed*; the bare
                    # constructor is used below as Tree(label, children)
                    # and does not parse a bracketed string.
                    v = Tree.fromstring(self._normalize(t + ')' * n))
                except ValueError:
                    continue
                sys.stderr.write(
                    " Recovered by adding %d close " "paren(s)\n" % n
                )
                return v
        # Try something else:
        sys.stderr.write(" Recovered by returning a flat parse.\n")
        return Tree('S', self._tag(t))
|
||||
|
||||
def _tag(self, t, tagset=None):
    """
    Extract ``(word, tag)`` pairs from one raw block.

    ``TAGWORD`` is run over the normalized block and yields
    ``(tag, word)`` matches, which are flipped to ``(word, tag)``.

    :param t: The raw block text.
    :param tagset: If given and different from ``self._tagset``, every
        tag is converted with ``map_tag(self._tagset, tagset, tag)``.
    :return: A list of ``(word, tag)`` tuples.
    """
    matches = TAGWORD.findall(self._normalize(t))
    if not tagset or tagset == self._tagset:
        return [(word, tag) for (tag, word) in matches]
    return [
        (word, map_tag(self._tagset, tagset, tag)) for (tag, word) in matches
    ]
|
||||
|
||||
def _word(self, t):
    """
    Return every ``WORD`` match found in the normalized form of block ``t``.

    :param t: The raw block text.
    :return: The list produced by ``WORD.findall``.
    """
    normalized = self._normalize(t)
    return WORD.findall(normalized)
|
||||
|
||||
|
||||
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A bracket-parse corpus reader whose documents are grouped into
    categories derived from their file identifiers.

    Every accessor takes either ``fileids`` or ``categories`` (never
    both) and forwards the resolved file list to the same-named
    C{BracketParseCorpusReader} method.

    NOTE(review): ``paras``, ``tagged_paras``, ``parsed_words`` and
    ``parsed_paras`` forward to base-class methods that are not defined
    in the visible part of this file -- confirm they exist.

    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  The keyword-argument dict is
        handed first to the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}, which receives the
        categorization arguments (C{cat_pattern}, C{cat_map}, and
        C{cat_file}).  All positional arguments and the keyword
        arguments are then passed to the L{BracketParseCorpusReader
        constructor <BracketParseCorpusReader.__init__>}.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) pair into a single fileids value.

        @return: C{fileids} unchanged when no categories are given,
            otherwise C{self.fileids(categories)}.
        @raise ValueError: If both arguments are supplied.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    def raw(self, fileids=None, categories=None):
        """Raw text of the selected files."""
        return BracketParseCorpusReader.raw(
            self, self._resolve(fileids, categories)
        )

    def words(self, fileids=None, categories=None):
        """Words of the selected files."""
        return BracketParseCorpusReader.words(
            self, self._resolve(fileids, categories)
        )

    def sents(self, fileids=None, categories=None):
        """Sentences of the selected files."""
        return BracketParseCorpusReader.sents(
            self, self._resolve(fileids, categories)
        )

    def paras(self, fileids=None, categories=None):
        """Paragraphs of the selected files."""
        return BracketParseCorpusReader.paras(
            self, self._resolve(fileids, categories)
        )

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        """Tagged words of the selected files; C{tagset} is forwarded."""
        return BracketParseCorpusReader.tagged_words(
            self, self._resolve(fileids, categories), tagset
        )

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        """Tagged sentences of the selected files; C{tagset} is forwarded."""
        return BracketParseCorpusReader.tagged_sents(
            self, self._resolve(fileids, categories), tagset
        )

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        """Tagged paragraphs of the selected files; C{tagset} is forwarded."""
        return BracketParseCorpusReader.tagged_paras(
            self, self._resolve(fileids, categories), tagset
        )

    def parsed_words(self, fileids=None, categories=None):
        """Forward to C{BracketParseCorpusReader.parsed_words}."""
        return BracketParseCorpusReader.parsed_words(
            self, self._resolve(fileids, categories)
        )

    def parsed_sents(self, fileids=None, categories=None):
        """Parsed sentences of the selected files."""
        return BracketParseCorpusReader.parsed_sents(
            self, self._resolve(fileids, categories)
        )

    def parsed_paras(self, fileids=None, categories=None):
        """Forward to C{BracketParseCorpusReader.parsed_paras}."""
        return BracketParseCorpusReader.parsed_paras(
            self, self._resolve(fileids, categories)
        )
|
||||
|
||||
|
||||
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by _parse.
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for _tag and _word, so
    _tag and _word are overridden to use a non-default new parameter 'ordered'
    to the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding='ISO-8859-1', tagset=None):
        """
        :param root: The root directory for the corpus.
        :param encoding: The encoding used to read the corpus file.
        :param tagset: The name of the tagset used by this corpus.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            # Raw string: '\.' in a plain literal is an invalid escape sequence.
            r'alpino\.xml',
            detect_blocks='blankline',
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos : Part of Speech: the Tag
        - word : the actual word
        The return value is a string with all xml elements replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        A block that does not start with "<alpino_ds" yields an empty string.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        # NOTE(review): the spaces inside these patterns are significant;
        # verify them against the canonical file before merging.
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # NOTE(review): '<node. *?begin' looks like a typo for
            # '<node .*?begin'; as written it only matches when 'begin'
            # directly follows '<node' plus one character. Left unchanged.
            t = re.sub(
                r' <node. *?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """
        Return the ``(word, tag)`` pairs of block ``t`` in sentence order.

        The block is normalized with ``ordered=True`` so each word clause
        carries its position; the clauses are sorted on that position and
        the position is then dropped.

        :param tagset: If given and different from ``self._tagset``, tags
            are converted with ``map_tag``.
        """
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]
|
||||
@@ -0,0 +1,199 @@
|
||||
# Natural Language Toolkit: Categorized Sentences Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader structured for corpora that contain one instance on each row.
|
||||
This CorpusReader is specifically used for the Subjectivity Dataset and the
|
||||
Sentence Polarity Dataset.
|
||||
|
||||
- Subjectivity Dataset information -
|
||||
|
||||
Authors: Bo Pang and Lillian Lee.
|
||||
Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
|
||||
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
|
||||
2004.
|
||||
|
||||
- Sentence Polarity Dataset information -
|
||||
|
||||
Authors: Bo Pang and Lillian Lee.
|
||||
Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
|
||||
|
||||
Related papers:
|
||||
|
||||
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
|
||||
sentiment categorization with respect to rating scales". Proceedings of the
|
||||
ACL, 2005.
|
||||
"""
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file identifiers
    (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23]
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents()
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding='utf8',
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def _resolve(self, fileids, categories):
        """
        Reduce a (fileids, categories) pair to a single fileids value.

        :raise ValueError: if both arguments are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def _fileid_list(self, fileids, categories):
        """
        Resolve the selection and normalize it: ``None`` means every file
        in the corpus, a single string becomes a one-element list.
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            return self._fileids
        if isinstance(fileids, string_types):
            return [fileids]
        return fileids

    def _views(self, fileids, categories, block_reader):
        """Concatenate one corpus view per selected file, read with ``block_reader``."""
        return concat(
            [
                self.CorpusView(path, block_reader, encoding=enc)
                for (path, enc, fileid) in self.abspaths(
                    self._fileid_list(fileids, categories), True, True
                )
            ]
        )

    def raw(self, fileids=None, categories=None):
        """
        :param fileids: a list or regexp specifying the fileids that have to be
            returned as a raw string.
        :param categories: a list specifying the categories whose files have to
            be returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        contents = []
        for f in self._fileid_list(fileids, categories):
            stream = self.open(f)
            # Close each stream as soon as it has been read instead of
            # leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def readme(self):
        """
        Return the contents of the corpus README file.
        """
        stream = self.open("README")
        try:
            return stream.read()
        finally:
            stream.close()

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        return self._views(fileids, categories, self._read_sent_block)

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, categories, self._read_word_block)

    def _read_sent_block(self, stream):
        """
        Read up to 20 lines from ``stream`` and return them as tokenized
        sentences. With a sentence tokenizer each line may yield several
        sentences; without one, each line is one sentence.
        """
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of stream: further reads would be empty too.
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Read one sentence block and flatten it into a single list of words."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
|
||||
171
venv/lib/python3.7/site-packages/nltk/corpus/reader/chasen.py
Normal file
171
venv/lib/python3.7/site-packages/nltk/corpus/reader/chasen.py
Normal file
@@ -0,0 +1,171 @@
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Masato Hagiwara <hagisan@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader import util
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class ChasenCorpusReader(CorpusReader):
    """
    Corpus reader for ChaSen-format files.

    Every accessor builds one ``ChasenCorpusView`` per selected file and
    concatenates them; the accessors differ only in the three flags
    (tagged, group by sentence, group by paragraph) given to the view.
    """

    def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
        """
        :param root: The root directory for the corpus.
        :param fileids: A list or regexp specifying the fileids in the corpus.
        :param encoding: The encoding used to read the corpus files.
        :param sent_splitter: Optional callable forwarded to every
            ``ChasenCorpusView``.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream once read instead of leaking the handle.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _views(self, fileids, tagged, group_by_sent, group_by_para):
        """Concatenate one ``ChasenCorpusView`` per file with the given flags."""
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Untagged tokens, flat."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Tagged tokens, flat."""
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Untagged tokens grouped by sentence."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Tagged tokens grouped by sentence."""
        return self._views(fileids, True, True, False)

    def paras(self, fileids=None):
        """Untagged tokens grouped by sentence and paragraph."""
        return self._views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """Tagged tokens grouped by sentence and paragraph."""
        return self._views(fileids, True, True, True)
|
||||
|
||||
|
||||
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        """
        :param corpus_file: The file this view reads.
        :param encoding: The encoding used to read it.
        :param tagged: If false, only the first tab-separated field of
            each line is kept; otherwise tokens are ``(field0, rest)`` tuples.
        :param group_by_sent: If true, each sentence is its own list.
        :param group_by_para: If true, each paragraph is its own list.
        :param sent_splitter: Optional callable; a truthy result for a
            token ends the current sentence.
        """
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        block = []
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para = []

            def flush(tokens):
                # Drop the tag part when untagged output is wanted, then
                # either nest the sentence or splice it into the paragraph.
                if not self._tagged:
                    tokens = [w for (w, t) in tokens]
                if self._group_by_sent:
                    para.append(tokens)
                else:
                    para.extend(tokens)

            pending = []
            for line in para_str.splitlines():
                at_eos = line.strip() == 'EOS'
                fields = line.split('\t')
                token = (fields[0], '\t'.join(fields[1:]))
                if not at_eos:
                    pending.append(token)

                if at_eos or (self._sent_splitter and self._sent_splitter(token)):
                    flush(pending)
                    pending = []

            # Tokens after the last sentence boundary form a final sentence.
            if len(pending) > 0:
                flush(pending)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
|
||||
|
||||
|
||||
def demo():
    """
    Print a slice of words and three tagged sentences from the 'jeita'
    corpus, loaded lazily with ``ChasenCorpusReader``.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    print('/'.join(jeita.words()[22100:22140]))

    # One "word/field" line per token; the field is the third
    # tab-separated item of the token's tag string.
    rendered = []
    for sent in jeita.tagged_sents()[2170:2173]:
        rendered.append(
            '\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
        )
    print('\nEOS\n'.join(rendered))
|
||||
|
||||
|
||||
def test():
    """Check that the tag of the first tagged word in 'jeita' is a string."""
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    first_tag = jeita.tagged_words()[0][1]
    assert isinstance(first_tag, string_types)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
demo()
|
||||
test()
|
||||
633
venv/lib/python3.7/site-packages/nltk/corpus/reader/childes.py
Normal file
633
venv/lib/python3.7/site-packages/nltk/corpus/reader/childes.py
Normal file
@@ -0,0 +1,633 @@
|
||||
# CHILDES XML Corpus Reader
|
||||
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
|
||||
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the XML version of the CHILDES corpus.
|
||||
"""
|
||||
from __future__ import print_function, division
|
||||
|
||||
__docformat__ = 'epytext en'
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from six import string_types
|
||||
|
||||
from nltk.util import flatten, LazyMap, LazyConcatenation
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
|
||||
|
||||
# to resolve the namespace issue
|
||||
NS = 'http://www.talkbank.org/ns/talkbank'
|
||||
|
||||
|
||||
class CHILDESCorpusReader(XMLCorpusReader):
|
||||
"""
|
||||
Corpus reader for the XML version of the CHILDES corpus.
|
||||
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
|
||||
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
|
||||
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
|
||||
(``nltk_data/corpora/CHILDES/``).
|
||||
|
||||
For access to the file text use the usual nltk functions,
|
||||
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
|
||||
"""
|
||||
|
||||
def __init__(self, root, fileids, lazy=True):
    """
    :param root: The root directory for the corpus.
    :param fileids: A list or regexp specifying the fileids in the corpus.
    :param lazy: Stored as ``self._lazy``; the accessors return lazy
        sequences when it is true and plain lists otherwise.
    """
    XMLCorpusReader.__init__(self, root, fileids)
    self._lazy = lazy
|
||||
|
||||
def words(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=False,
    strip_space=True,
    replace=False,
):
    """
    :return: the given file(s) as a list of words. In lazy mode the
        per-file results are concatenated lazily; otherwise a list with
        one entry per file is returned.
    :rtype: list(str)

    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of (stem, index,
        dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def load(path):
        # sent=None and pos=False are passed through to _get_words.
        return self._get_words(
            path, speaker, None, stem, relation, False, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(load, paths))
    return [load(path) for path in paths]
|
||||
|
||||
def tagged_words(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=False,
    strip_space=True,
    replace=False,
):
    """
    :return: the given file(s) as a list of tagged
        words and punctuation symbols, encoded as tuples
        ``(word,tag)``. In lazy mode the per-file results are
        concatenated lazily; otherwise a list with one entry per file
        is returned.
    :rtype: list(tuple(str,str))

    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of (stem, index,
        dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def load(path):
        # sent=None and pos=True are passed through to _get_words.
        return self._get_words(
            path, speaker, None, stem, relation, True, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(load, paths))
    return [load(path) for path in paths]
|
||||
|
||||
def sents(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=None,
    strip_space=True,
    replace=False,
):
    """
    :return: the given file(s) as a list of sentences or utterances, each
        encoded as a list of word strings. In lazy mode the per-file
        results are concatenated lazily; otherwise a list with one
        entry per file is returned.
    :rtype: list(list(str))

    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
        If there is manually-annotated relation info, it will return
        tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def load(path):
        # sent=True and pos=False are passed through to _get_words.
        return self._get_words(
            path, speaker, True, stem, relation, False, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(load, paths))
    return [load(path) for path in paths]
|
||||
|
||||
def tagged_sents(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=None,
    strip_space=True,
    replace=False,
):
    """
    :return: the given file(s) as a list of
        sentences, each encoded as a list of ``(word,tag)`` tuples.
        In lazy mode the per-file results are concatenated lazily;
        otherwise a list with one entry per file is returned.
    :rtype: list(list(tuple(str,str)))

    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
        If there is manually-annotated relation info, it will return
        tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def load(path):
        # sent=True and pos=True are passed through to _get_words.
        return self._get_words(
            path, speaker, True, stem, relation, True, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(load, paths))
    return [load(path) for path in paths]
|
||||
|
||||
def corpus(self, fileids=None):
    """
    :return: for each given file, a dict of ``(corpus_property_key, value)``
        pairs; a lazy sequence in lazy mode, a plain list otherwise.
    :rtype: list(dict)
    """
    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(self._get_corpus, paths)
    return [self._get_corpus(path) for path in paths]
|
||||
|
||||
def _get_corpus(self, fileid):
|
||||
results = dict()
|
||||
xmldoc = ElementTree.parse(fileid).getroot()
|
||||
for key, value in xmldoc.items():
|
||||
results[key] = value
|
||||
return results
|
||||
|
||||
def participants(self, fileids=None):
    """
    :return: for each given file, a dict of
        ``(participant_property_key, value)``; a lazy sequence in lazy
        mode, a plain list otherwise.
    :rtype: list(dict)
    """
    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(self._get_participants, paths)
    return [self._get_participants(path) for path in paths]
|
||||
|
||||
def _get_participants(self, fileid):
    """
    Parse ``fileid`` and collect the attributes of every
    ``Participants/participant`` element.

    :return: a nested ``defaultdict`` keyed by the participant's ``id``
        attribute, then by attribute name. Missing keys create empty
        nested dicts on access.
    """

    # multidimensional dicts
    def nested():
        return defaultdict(nested)

    root = ElementTree.parse(fileid).getroot()
    query = './/{%s}Participants/{%s}participant' % (NS, NS)
    # getting participants' data
    table = nested()
    for node in root.findall(query):
        for (attr, value) in node.items():
            table[node.get('id')][attr] = value
    return table
|
||||
|
||||
def age(self, fileids=None, speaker='CHI', month=False):
    """
    :return: for each given file, the age of ``speaker`` as a string or
        an int; a lazy sequence in lazy mode, a plain list otherwise.
    :rtype: list or int

    :param speaker: The participant id whose age is looked up.
    :param month: If true, return months instead of year-month-date
    """

    def lookup(path):
        return self._get_age(path, speaker, month)

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(lookup, paths)
    return [lookup(path) for path in paths]
|
||||
|
||||
def _get_age(self, fileid, speaker, month):
    """
    Return the ``age`` attribute of the participant whose ``id`` equals
    ``speaker`` in ``fileid``.

    :param month: If true, the age is passed through ``convert_age``.
    :return: the age, or ``None`` when no participant matches or when
        the lookup raises ``TypeError``/``AttributeError``.
    """
    root = ElementTree.parse(fileid).getroot()
    query = './/{%s}Participants/{%s}participant' % (NS, NS)
    for node in root.findall(query):
        try:
            if node.get('id') != speaker:
                continue
            found = node.get('age')
            return self.convert_age(found) if month else found
        # some files don't have age data
        except (TypeError, AttributeError):
            return None
|
||||
|
||||
def convert_age(self, age_year):
    """
    Calculate age in months from a string in CHILDES format.

    The string is matched against ``P<years>Y<months>M<days>D`` where
    the ``M``, the days and the ``D`` are optional. The result is
    ``years * 12 + months``, plus one if the day count is above 15.

    :param age_year: The age string, e.g. ``'P2Y6M14D'``.
    :return: The age in whole months.
    :rtype: int
    :raise AttributeError: If ``age_year`` does not match the pattern.
    :raise TypeError: If ``age_year`` is not a string.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
    age_month = int(m.group(1)) * 12 + int(m.group(2))
    try:
        if int(m.group(3)) > 15:
            age_month += 1
    # some corpora don't have age information?
    except ValueError:
        # The day group matched the empty string; nothing to round.
        pass
    return age_month
|
||||
|
||||
def MLU(self, fileids=None, speaker='CHI'):
    """
    :return: for each given file, the value computed by ``_getMLU`` for
        ``speaker``; a lazy sequence in lazy mode, a plain list otherwise.
    :rtype: list(float)
    """

    def compute(path):
        return self._getMLU(path, speaker=speaker)

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(compute, paths)
    return [compute(path) for path in paths]
|
||||
|
||||
def _getMLU(self, fileid, speaker):
    """
    Compute a mean-length-of-utterance figure for ``speaker`` in ``fileid``.

    Sentences are fetched as stemmed ``(word, pos)`` pairs. A sentence
    is skipped when any pos is ``'unk'``, when it is empty, or when it
    equals the previously kept sentence. Words are split on ``'-'`` to
    count morphemes. Tokens whose pos is ``'co'`` or ``None`` are
    subtracted from the morpheme count, and each sentence containing
    one is subtracted from the sentence count.

    :return: morphemes divided by sentences, or ``0`` when the sentence
        count is zero.
    """
    tagged = self._get_words(
        fileid,
        speaker=speaker,
        sent=True,
        stem=True,
        relation=False,
        pos=True,
        strip_space=True,
        replace=True,
    )
    kept = []
    previous = []
    filler_tokens = 0
    filler_sents = 0
    for sent in tagged:
        tags = [pos for (word, pos) in sent]
        # Skip sentences with any unintelligible part, null sentences,
        # and sentences identical to the last kept one.
        if any(pos == 'unk' for pos in tags) or sent == [] or sent == previous:
            continue
        kept.append([word for (word, pos) in sent])
        # count number of fillers
        fillers_here = tags.count('co') + tags.count(None)
        if fillers_here > 0:
            filler_tokens += fillers_here
            filler_sents += 1
        previous = sent
    try:
        all_words = flatten(kept)
        # count number of morphemes
        # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
        morphemes = flatten([word.split('-') for word in all_words])
        num_words = len(morphemes) - filler_tokens
        num_sents = len(kept) - filler_sents
        mlu = num_words / num_sents
    except ZeroDivisionError:
        mlu = 0
    return mlu
|
||||
|
||||
def _get_words(
    self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
    """
    Extract the words of one XML corpus file, utterance by utterance.

    The file is parsed with ``ElementTree``; every ``u`` element in
    namespace ``NS`` is treated as an utterance and every ``w`` element
    below it as a word.

    :param fileid: the file handed to ``ElementTree.parse``.
    :param speaker: ``'ALL'``, a single speaker code, or a collection of
        codes; an utterance is kept when its ``who`` attribute is among
        them.
    :param sent: if true, return one list of words per utterance.
    :param stem: if true, use the text of the word's ``stem`` element,
        followed by ``'-'`` plus the inflection text and ``'~'`` plus the
        suffix stem when those are present.
    :param relation: if true, stems and tags are produced as for ``stem``
        and ``pos``, ``"index|head|relation"`` strings taken from ``gra``
        elements are added to each word tuple, and the result is grouped
        by utterance.
    :param pos: if true, every word becomes a ``(word, tag)`` tuple.
    :param strip_space: if true, strip surrounding whitespace from the
        word text.
    :param replace: if true, try to substitute replacement words (see the
        review note in the body).
    :return: a ``LazyMap`` applying the identity function to the collected
        words or utterances.
    """
    if (
        isinstance(speaker, string_types) and speaker != 'ALL'
    ):  # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sents = []
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            for xmlword in xmlsent.findall('.//{%s}w' % NS):
                # `infl` is assigned here but not read again in this method.
                infl = None
                suffixStem = None
                suffixTag = None
                # getting replaced words
                # NOTE(review): both lookups search the whole utterance
                # (xmlsent), not the current word, and the conditions
                # truth-test the Element returned by find(). It looks as
                # if xmlword is rebound to the first match in the
                # utterance -- confirm this is intended.
                if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS)):
                    xmlword = xmlsent.find(
                        './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
                    )
                elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS)):
                    xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
                # get text
                if xmlword.text:
                    word = xmlword.text
                else:
                    word = ''
                # strip trailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    # A word without a <stem> element keeps its surface
                    # text: find() returns None and `.text` raises
                    # AttributeError.
                    try:
                        xmlstem = xmlword.find('.//{%s}stem' % NS)
                        word = xmlstem.text
                    except AttributeError as e:
                        pass
                    # if there is an inflection
                    # Any failure here (e.g. no <mk> element) leaves
                    # `word` unchanged.
                    try:
                        xmlinfl = xmlword.find(
                            './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
                        )
                        word += '-' + xmlinfl.text
                    except:
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                            % (NS, NS, NS, NS)
                        )
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                    if suffixStem:
                        word += "~" + suffixStem
                # pos
                if relation or pos:
                    # Tag is "<c text>" or "<c text>:<s text>"; it falls
                    # back to "" when the elements are missing.
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                    except (AttributeError, IndexError) as e:
                        tag = ""
                    # Same construction for the suffix's tag; on any
                    # failure suffixTag stays None and nothing is appended.
                    try:
                        xmlsuffixpos = xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                            % (NS, NS, NS, NS, NS)
                        )
                        xmlsuffixpos2 = xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                            % (NS, NS, NS, NS, NS)
                        )
                        if xmlsuffixpos2:
                            suffixTag = (
                                xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                            )
                        else:
                            suffixTag = xmlsuffixpos[0].text
                    except:
                        pass
                    if suffixTag:
                        tag += "~" + suffixTag
                    word = (word, tag)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall(
                        './/{%s}mor/{%s}gra' % (NS, NS)
                    ):
                        if not xmlstem_rel.get('type') == 'grt':
                            # Non-'grt' relation: (word, tag, relation).
                            word = (
                                word[0],
                                word[1],
                                xmlstem_rel.get('index')
                                + "|"
                                + xmlstem_rel.get('head')
                                + "|"
                                + xmlstem_rel.get('relation'),
                            )
                        else:
                            # 'grt' relation: extends the existing triple
                            # to six elements. This reads word[2], so it
                            # presumably expects a non-'grt' <gra> to have
                            # been seen first -- TODO confirm.
                            word = (
                                word[0],
                                word[1],
                                word[2],
                                word[0],
                                word[1],
                                xmlstem_rel.get('index')
                                + "|"
                                + xmlstem_rel.get('head')
                                + "|"
                                + xmlstem_rel.get('relation'),
                            )
                    # NOTE(review): the tuples built below are stored in
                    # suffixStem, which is not read again before the next
                    # word resets it; errors are swallowed by the bare
                    # except.
                    try:
                        for xmlpost_rel in xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)
                        ):
                            if not xmlpost_rel.get('type') == 'grt':
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get('index')
                                    + "|"
                                    + xmlpost_rel.get('head')
                                    + "|"
                                    + xmlpost_rel.get('relation'),
                                )
                            else:
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    suffixStem[2],
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get('index')
                                    + "|"
                                    + xmlpost_rel.get('head')
                                    + "|"
                                    + xmlpost_rel.get('relation'),
                                )
                    except:
                        pass
                sents.append(word)
            # Keep utterance boundaries only when sentences or relations
            # were requested; otherwise splice the words into one flat
            # list.
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    return LazyMap(lambda x: x, results)
|
||||
|
||||
# Ready-to-use browser opener
|
||||
|
||||
# Read by ``webview_file`` as ``self.childes_url_base``; the transcript path
# is appended to it to form the URL that is opened.
"""
The base URL for viewing files on the childes website. This
shouldn't need to be changed, unless CHILDES changes the configuration
of their server or unless the user sets up their own corpus webserver.
"""
childes_url_base = r'https://childes.talkbank.org/browser/index.php?url='
|
||||
|
||||
def webview_file(self, fileid, urlbase=None):
    """Map a corpus file to its web version on the CHILDES website,
    and open it in a web browser.

    The complete URL to be used is:
        childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

    If no urlbase is passed, we try to calculate it. This
    requires that the childes corpus was set up to mirror the
    folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
    nltk_data/corpora/childes/Eng-USA/Cornell/??? or
    nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

    The function first checks whether "childes", possibly followed by
    "data-xml", appears on the path consisting of <corpus root>+fileid;
    failing that it looks (as a special case) for "Eng-USA". If neither
    one is found, or the path has no ".xml" part to cut at, we use the
    unmodified fileid and hope for the best.
    If this is not right, specify urlbase explicitly, e.g., if the
    corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.

    :param fileid: corpus file identifier (normally ending in ``.xml``).
    :param urlbase: optional path prefix to use instead of guessing one.
    :return: None; the URL is opened in a browser tab and printed.
    """

    import webbrowser

    if urlbase:
        path = urlbase + "/" + fileid
    else:
        full = self.root + "/" + fileid
        full = re.sub(r'\\', '/', full)
        # Fall back to the bare fileid when no known folder is recognised.
        path = fileid
        if '/childes/' in full.lower():
            # Discard /data-xml/ if present
            found = re.search(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)
            if found:
                path = found.group(1)
        elif 'eng-usa' in full.lower():
            # The inline (?i) flag must lead the pattern: a global flag in
            # mid-pattern is rejected by Python 3.11 and later.
            found = re.search(r'(?i)/Eng-USA/(.*)\.xml', full)
            if found:
                path = 'Eng-USA/' + found.group(1)

    # Strip ".xml" and add ".cha", as necessary:
    if path.endswith('.xml'):
        path = path[:-4]

    if not path.endswith('.cha'):
        path = path + '.cha'

    url = self.childes_url_base + path

    webbrowser.open_new_tab(url)
    print("Opening in browser:", url)
    # Pausing is a good idea, but it's up to the user...
    # raw_input("Hit Return to continue")
|
||||
|
||||
|
||||
def demo(corpus_root=None):
    """
    Print a tour of the first five files of a CHILDES corpus.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``

    :param corpus_root: path to a portion of the CHILDES corpus; when
        omitted, ``corpora/childes/data-xml/Eng-USA/`` is located with
        ``nltk.data.find``.
    """
    try:
        # The lookup lives inside the ``try`` so that a missing corpus
        # reaches the download instructions below instead of escaping as a
        # bare LookupError.
        if not corpus_root:
            from nltk.data import find

            corpus_root = find('corpora/childes/data-xml/Eng-USA/')

        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        for fileid in childes.fileids()[:5]:
            metadata = childes.corpus(fileid)[0]
            corpus = metadata.get("Corpus", '')
            corpus_id = metadata.get("Id", '')
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(fileid)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(fileid, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(fileid)[:7], " ...")
            print("words (only MOT):", childes.words(fileid, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(fileid, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(fileid, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(fileid, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(fileid)[:2], " ...")
            for (participant, values) in childes.participants(fileid)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(fileid)))
            print("num of morphemes:", len(childes.words(fileid, stem=True)))
            print("age:", childes.age(fileid))
            print("age in month:", childes.age(fileid, month=True))
            print("MLU:", childes.MLU(fileid))
            print()

    except LookupError:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/')
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
|
||||
|
||||
|
||||
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
||||
285
venv/lib/python3.7/site-packages/nltk/corpus/reader/chunked.py
Normal file
285
venv/lib/python3.7/site-packages/nltk/corpus/reader/chunked.py
Normal file
@@ -0,0 +1,285 @@
|
||||
# Natural Language Toolkit: Chunked Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that contain chunked (and optionally tagged)
|
||||
documents.
|
||||
"""
|
||||
|
||||
import os.path, codecs
|
||||
|
||||
from six import string_types
|
||||
|
||||
import nltk
|
||||
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
||||
from nltk.tree import Tree
|
||||
from nltk.tokenize import *
|
||||
from nltk.chunk import tagstr2tree
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension='',
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        para_block_reader=read_blankline_block,
        encoding='utf8',
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param extension: accepted but not used by this constructor.
        :param str2chunktree: function turning a sentence string into a
            chunk tree.
        :param sent_tokenizer: tokenizer splitting a paragraph string into
            sentence strings.
        :param para_block_reader: function reading paragraph strings from
            a stream.
        :param encoding: encoding passed to ``CorpusReader.__init__``.
        :param tagset: passed to every corpus view as its source tagset.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Trailing positional arguments for every corpus view generated by
        # this corpus: (str2chunktree, sent_tokenizer, para_block_reader,
        # source tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read instead of
            # leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _chunked_views(
        self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None
    ):
        """
        Concatenate one ``ChunkedCorpusView`` per selected file.

        :param fileids: file selection, passed to ``self.abspaths``.
        :param tagged: keep ``(word, tag)`` tuples when true.
        :param group_by_sent: keep sentence boundaries when true.
        :param group_by_para: keep paragraph boundaries when true.
        :param chunked: keep chunk trees when true.
        :param tagset: target tagset handed to each view.
        """
        return concat(
            [
                ChunkedCorpusView(
                    f,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    chunked,
                    *self._cv_args,
                    target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._chunked_views(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._chunked_views(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._chunked_views(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._chunked_views(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return self._chunked_views(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._chunked_views(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._chunked_views(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return self._chunked_views(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._chunked_views(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        # Parse each blank-line-delimited block with the default converter.
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
|
||||
|
||||
|
||||
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view that reads paragraphs from a chunked corpus file
    and returns them as words, sentences or paragraphs, with or without
    tags and chunk structure.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        """
        :param fileid: file backing this view.
        :param encoding: encoding handed to ``StreamBackedCorpusView``.
        :param tagged: keep ``(word, tag)`` tuples when true.
        :param group_by_sent: keep each sentence as one item when true.
        :param group_by_para: keep each paragraph as one item when true.
        :param chunked: keep chunk trees when true, else only leaves.
        :param str2chunktree: converter from sentence string to tree.
        :param sent_tokenizer: splits a paragraph string into sentences.
        :param para_block_reader: reads paragraph strings from a stream.
        :param source_tagset: forwarded to ``str2chunktree``.
        :param target_tagset: forwarded to ``str2chunktree``.
        """
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Switches controlling the shape of what read_block returns.
        self._tagged, self._chunked = tagged, chunked
        self._group_by_sent, self._group_by_para = group_by_sent, group_by_para
        # Pluggable processing steps.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        # Tagset mapping passed through to the converter.
        self._source_tagset, self._target_tagset = source_tagset, target_tagset

    def read_block(self, stream):
        """
        Read the next paragraph block from ``stream`` and convert it
        according to the view's grouping, tagging and chunking switches.
        """
        collected = []
        for paragraph_text in self._para_block_reader(stream):
            paragraph = []
            # Grouping keeps the item whole; otherwise its members are
            # spliced into the enclosing list.
            add_sentence = (
                paragraph.append if self._group_by_sent else paragraph.extend
            )
            for sentence_text in self._sent_tokenizer.tokenize(paragraph_text):
                parsed = self._str2chunktree(
                    sentence_text,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )
                if not self._tagged:
                    # Tags were not requested: reduce tuples to words.
                    parsed = self._untag(parsed)
                if not self._chunked:
                    # Chunks were not requested: keep only the leaves.
                    parsed = parsed.leaves()
                add_sentence(parsed)

            add_paragraph = (
                collected.append if self._group_by_para else collected.extend
            )
            add_paragraph(paragraph)
        return collected

    def _untag(self, tree):
        """
        Replace every ``(word, tag)`` leaf of ``tree`` by its word, in
        place and recursively, and return the same tree.

        :raise ValueError: if a child is neither a ``Tree`` nor a tuple.
        """
        for index, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
                continue
            if not isinstance(child, tuple):
                raise ValueError('expected child to be Tree or tuple')
            tree[index] = child[0]
        return tree
|
||||
@@ -0,0 +1,99 @@
|
||||
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
|
||||
ftp://ftp.cs.cmu.edu/project/speech/dict/
|
||||
Copyright 1998 Carnegie Mellon University
|
||||
|
||||
File Format: Each line consists of an uppercased word, a counter
|
||||
(for alternative pronunciations), and a transcription. Vowels are
|
||||
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
|
||||
NATURAL 1 N AE1 CH ER0 AH0 L
|
||||
|
||||
The dictionary contains 127069 entries. Of these, 119400 words are assigned
|
||||
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
|
||||
three or more pronunciations. Many of these are fast-speech variants.
|
||||
|
||||
Phonemes: There are 39 phonemes, as shown below:
|
||||
|
||||
Phoneme Example Translation Phoneme Example Translation
|
||||
------- ------- ----------- ------- ------- -----------
|
||||
AA odd AA D AE at AE T
|
||||
AH hut HH AH T AO ought AO T
|
||||
AW cow K AW AY hide HH AY D
|
||||
B be B IY CH cheese CH IY Z
|
||||
D dee D IY DH thee DH IY
|
||||
EH Ed EH D ER hurt HH ER T
|
||||
EY ate EY T F fee F IY
|
||||
G green G R IY N HH he HH IY
|
||||
IH it IH T IY eat IY T
|
||||
JH gee JH IY K key K IY
|
||||
L lee L IY M me M IY
|
||||
N knee N IY NG ping P IH NG
|
||||
OW oat OW T OY toy T OY
|
||||
P pee P IY R read R IY D
|
||||
S sea S IY SH she SH IY
|
||||
T tea T IY TH theta TH EY T AH
|
||||
UH hood HH UH D UW two T UW
|
||||
V vee V IY W we W IY
|
||||
Y yield Y IY L D Z zee Z IY
|
||||
ZH seizure S IY ZH ER
|
||||
"""
|
||||
|
||||
from nltk import compat
|
||||
from nltk.util import Index
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class CMUDictCorpusReader(CorpusReader):
    """
    Corpus reader for the cmudict lexicon: each entry pairs a word with
    one transcription.
    """

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
                for fileid, enc in self.abspaths(None, True)
            ]
        )

    def raw(self):
        """
        :return: the cmudict lexicon as a raw string.
        """
        fileids = self._fileids
        if isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read instead of
            # leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [word.lower() for (word, _) in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        # Inside the method body ``dict`` still names the builtin type.
        return dict(Index(self.entries()))
|
||||
|
||||
|
||||
def read_cmudict_block(stream):
    """
    Read up to 100 dictionary entries from ``stream``.

    Each non-blank line is split on whitespace: the first field is the
    word (lowercased), the second field is skipped, and the remaining
    fields form the transcription. Blank lines are ignored.

    :param stream: object with a ``readline()`` method returning ``''`` at
        end of file.
    :return: a list of ``(word, transcription)`` tuples; shorter than 100
        items (possibly empty) when the end of the file is reached.
    """
    entries = []
    while len(entries) < 100:  # Read 100 at a time.
        line = stream.readline()
        if line == '':
            return entries  # end of file.
        pieces = line.split()
        if not pieces:
            continue  # blank line: nothing to record.
        entries.append((pieces[0].lower(), pieces[2:]))
    return entries
|
||||
@@ -0,0 +1,328 @@
|
||||
# Natural Language Toolkit: Comparative Sentence Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Comparative Sentence Dataset.
|
||||
|
||||
- Comparative Sentence Dataset information -
|
||||
|
||||
Annotated by: Nitin Jindal and Bing Liu, 2006.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Nitin Jindal, njindal@cs.uic.edu
|
||||
Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
|
||||
Proceedings of the ACM SIGIR International Conference on Information Retrieval
|
||||
(SIGIR-06), 2006.
|
||||
|
||||
- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
|
||||
Proceedings of Twenty First National Conference on Artificial Intelligence
|
||||
(AAAI-2006), 2006.
|
||||
|
||||
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
||||
Proceedings of the 22nd International Conference on Computational Linguistics
|
||||
(Coling-2008), Manchester, 18-22 August, 2008.
|
||||
"""
|
||||
import re
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
# Regular expressions for dataset components
# A line made up solely of one or more asterisks.
STARS = re.compile(r'^\*+$')
# An opening comparison tag, ``<cs-1>`` to ``<cs-4>``.
COMPARISON = re.compile(r'<cs-[1234]>')
# The matching closing tag, ``</cs-1>`` to ``</cs-4>``.
CLOSE_COMPARISON = re.compile(r'</cs-[1234]>')
# Opening tags of types 1-3, which the reader treats as gradable.
GRAD_COMPARISON = re.compile(r'<cs-[123]>')
# Opening tag of type 4, which the reader treats as non-gradable.
NON_GRAD_COMPARISON = re.compile(r'<cs-4>')
# ``<digit>_<text>`` items: group 1 is the digit, group 2 the text, made of
# '.', word characters, whitespace, '/' and '-', stopping short of the next
# ``<digit>_`` label.
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
# Content of the last parenthesised group, which must close at the end of
# the line.
KEYWORD = re.compile(r'\((?!.*\()(.*)\)$')
|
||||
|
||||
|
||||
class Comparison(object):
    """
    Holder for one comparative sentence together with the parts
    annotated on it.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text, self.comp_type = text, comp_type
        self.entity_1, self.entity_2 = entity_1, entity_2
        self.feature, self.keyword = feature, keyword

    def __repr__(self):
        # One placeholder per attribute, in constructor order.
        template = (
            "Comparison(text=\"{}\", comp_type={}, entity_1=\"{}\", entity_2=\"{}\", "
            "feature=\"{}\", keyword=\"{}\")"
        )
        values = (
            self.text,
            self.comp_type,
            self.entity_1,
            self.entity_2,
            self.feature,
            self.keyword,
        )
        return template.format(*values)
|
||||
|
||||
|
||||
class ComparativeSentencesCorpusReader(CorpusReader):
|
||||
"""
|
||||
Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
|
||||
|
||||
>>> from nltk.corpus import comparative_sentences
|
||||
>>> comparison = comparative_sentences.comparisons()[0]
|
||||
>>> comparison.text
|
||||
['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
|
||||
'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
|
||||
'had', '.']
|
||||
>>> comparison.entity_2
|
||||
'models'
|
||||
>>> (comparison.feature, comparison.keyword)
|
||||
('rewind', 'more')
|
||||
>>> len(comparative_sentences.comparisons())
|
||||
853
|
||||
"""
|
||||
|
||||
CorpusView = StreamBackedCorpusView
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
root,
|
||||
fileids,
|
||||
word_tokenizer=WhitespaceTokenizer(),
|
||||
sent_tokenizer=None,
|
||||
encoding='utf8',
|
||||
):
|
||||
"""
|
||||
:param root: The root directory for this corpus.
|
||||
:param fileids: a list or regexp specifying the fileids in this corpus.
|
||||
:param word_tokenizer: tokenizer for breaking sentences or paragraphs
|
||||
into words. Default: `WhitespaceTokenizer`
|
||||
:param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
|
||||
:param encoding: the encoding that should be used to read the corpus.
|
||||
"""
|
||||
|
||||
CorpusReader.__init__(self, root, fileids, encoding)
|
||||
self._word_tokenizer = word_tokenizer
|
||||
self._sent_tokenizer = sent_tokenizer
|
||||
|
||||
def comparisons(self, fileids=None):
|
||||
"""
|
||||
Return all comparisons in the corpus.
|
||||
|
||||
:param fileids: a list or regexp specifying the ids of the files whose
|
||||
comparisons have to be returned.
|
||||
:return: the given file(s) as a list of Comparison objects.
|
||||
:rtype: list(Comparison)
|
||||
"""
|
||||
if fileids is None:
|
||||
fileids = self._fileids
|
||||
elif isinstance(fileids, string_types):
|
||||
fileids = [fileids]
|
||||
return concat(
|
||||
[
|
||||
self.CorpusView(path, self._read_comparison_block, encoding=enc)
|
||||
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||||
]
|
||||
)
|
||||
|
||||
def keywords(self, fileids=None):
|
||||
"""
|
||||
Return a set of all keywords used in the corpus.
|
||||
|
||||
:param fileids: a list or regexp specifying the ids of the files whose
|
||||
keywords have to be returned.
|
||||
:return: the set of keywords and comparative phrases used in the corpus.
|
||||
:rtype: set(str)
|
||||
"""
|
||||
all_keywords = concat(
|
||||
[
|
||||
self.CorpusView(path, self._read_keyword_block, encoding=enc)
|
||||
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||||
]
|
||||
)
|
||||
|
||||
keywords_set = set(keyword.lower() for keyword in all_keywords if keyword)
|
||||
return keywords_set
|
||||
|
||||
def keywords_readme(self):
|
||||
"""
|
||||
Return the list of words and constituents considered as clues of a
|
||||
comparison (from listOfkeywords.txt).
|
||||
"""
|
||||
keywords = []
|
||||
raw_text = self.open("listOfkeywords.txt").read()
|
||||
for line in raw_text.split("\n"):
|
||||
if not line or line.startswith("//"):
|
||||
continue
|
||||
keywords.append(line.strip())
|
||||
return keywords
|
||||
|
||||
def raw(self, fileids=None):
|
||||
"""
|
||||
:param fileids: a list or regexp specifying the fileids that have to be
|
||||
returned as a raw string.
|
||||
:return: the given file(s) as a single string.
|
||||
:rtype: str
|
||||
"""
|
||||
if fileids is None:
|
||||
fileids = self._fileids
|
||||
elif isinstance(fileids, string_types):
|
||||
fileids = [fileids]
|
||||
return concat([self.open(f).read() for f in fileids])
|
||||
|
||||
def readme(self):
|
||||
"""
|
||||
Return the contents of the corpus readme file.
|
||||
"""
|
||||
return self.open("README.txt").read()
|
||||
|
||||
def sents(self, fileids=None):
|
||||
"""
|
||||
Return all sentences in the corpus.
|
||||
|
||||
:param fileids: a list or regexp specifying the ids of the files whose
|
||||
sentences have to be returned.
|
||||
:return: all sentences of the corpus as lists of tokens (or as plain
|
||||
strings, if no word tokenizer is specified).
|
||||
:rtype: list(list(str)) or list(str)
|
||||
"""
|
||||
return concat(
|
||||
[
|
||||
self.CorpusView(path, self._read_sent_block, encoding=enc)
|
||||
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||||
]
|
||||
)
|
||||
|
||||
def words(self, fileids=None):
|
||||
"""
|
||||
Return all words and punctuation symbols in the corpus.
|
||||
|
||||
:param fileids: a list or regexp specifying the ids of the files whose
|
||||
words have to be returned.
|
||||
:return: the given file(s) as a list of words and punctuation symbols.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
return concat(
|
||||
[
|
||||
self.CorpusView(path, self._read_word_block, encoding=enc)
|
||||
for (path, enc, fileid) in self.abspaths(fileids, True, True)
|
||||
]
|
||||
)
|
||||
|
||||
def _read_comparison_block(self, stream):
|
||||
while True:
|
||||
line = stream.readline()
|
||||
if not line:
|
||||
return [] # end of file.
|
||||
comparison_tags = re.findall(COMPARISON, line)
|
||||
if comparison_tags:
|
||||
grad_comparisons = re.findall(GRAD_COMPARISON, line)
|
||||
non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
|
||||
# Advance to the next line (it contains the comparative sentence)
|
||||
comparison_text = stream.readline().strip()
|
||||
if self._word_tokenizer:
|
||||
comparison_text = self._word_tokenizer.tokenize(comparison_text)
|
||||
# Skip the next line (it contains closing comparison tags)
|
||||
stream.readline()
|
||||
# If gradable comparisons are found, create Comparison instances
|
||||
# and populate their fields
|
||||
comparison_bundle = []
|
||||
if grad_comparisons:
|
||||
# Each comparison tag has its own relations on a separate line
|
||||
for comp in grad_comparisons:
|
||||
comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
|
||||
comparison = Comparison(
|
||||
text=comparison_text, comp_type=comp_type
|
||||
)
|
||||
line = stream.readline()
|
||||
entities_feats = ENTITIES_FEATS.findall(line)
|
||||
if entities_feats:
|
||||
for (code, entity_feat) in entities_feats:
|
||||
if code == '1':
|
||||
comparison.entity_1 = entity_feat.strip()
|
||||
elif code == '2':
|
||||
comparison.entity_2 = entity_feat.strip()
|
||||
elif code == '3':
|
||||
comparison.feature = entity_feat.strip()
|
||||
keyword = KEYWORD.findall(line)
|
||||
if keyword:
|
||||
comparison.keyword = keyword[0]
|
||||
comparison_bundle.append(comparison)
|
||||
# If non-gradable comparisons are found, create a simple Comparison
|
||||
# instance for each one
|
||||
if non_grad_comparisons:
|
||||
for comp in non_grad_comparisons:
|
||||
# comp_type in this case should always be 4.
|
||||
comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
|
||||
comparison = Comparison(
|
||||
text=comparison_text, comp_type=comp_type
|
||||
)
|
||||
comparison_bundle.append(comparison)
|
||||
# Flatten the list of comparisons before returning them
|
||||
# return concat([comparison_bundle])
|
||||
return comparison_bundle
|
||||
|
||||
def _read_keyword_block(self, stream):
|
||||
keywords = []
|
||||
for comparison in self._read_comparison_block(stream):
|
||||
keywords.append(comparison.keyword)
|
||||
return keywords
|
||||
|
||||
def _read_sent_block(self, stream):
    """Read the next plain-text line from *stream* and tokenize it.

    Everything between two lines matching ``STARS`` is skipped, as is any
    line that matches ``COMPARISON``, ``ENTITIES_FEATS`` or
    ``CLOSE_COMPARISON``.  The first line that survives is returned as a
    list of tokenized sentences: one per sentence found by the sentence
    tokenizer when one is configured, otherwise a single sentence.
    """
    while True:
        line = stream.readline()
        if re.match(STARS, line):
            # Skip up to the closing STARS line.  ``readline`` returns ''
            # at end of file, so stop there as well; otherwise an
            # unterminated STARS section would loop forever.
            while True:
                line = stream.readline()
                if not line or re.match(STARS, line):
                    break
            continue
        if (
            not re.findall(COMPARISON, line)
            and not ENTITIES_FEATS.findall(line)
            and not re.findall(CLOSE_COMPARISON, line)
        ):
            if self._sent_tokenizer:
                return [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(line)
                ]
            else:
                return [self._word_tokenizer.tokenize(line)]
|
||||
|
||||
def _read_word_block(self, stream):
|
||||
words = []
|
||||
for sent in self._read_sent_block(stream):
|
||||
words.extend(sent)
|
||||
return words
|
||||
592
venv/lib/python3.7/site-packages/nltk/corpus/reader/conll.py
Normal file
592
venv/lib/python3.7/site-packages/nltk/corpus/reader/conll.py
Normal file
@@ -0,0 +1,592 @@
|
||||
# Natural Language Toolkit: CONLL Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read CoNLL-style chunk fileids.
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import textwrap
|
||||
|
||||
from nltk import compat
|
||||
from nltk.tree import Tree
|
||||
from nltk.util import LazyMap, LazyConcatenation
|
||||
from nltk.tag import map_tag
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class ConllCorpusReader(CorpusReader):
|
||||
"""
|
||||
A corpus reader for CoNLL-style files. These files consist of a
|
||||
series of sentences, separated by blank lines. Each sentence is
|
||||
encoded using a table (or "grid") of values, where each line
|
||||
corresponds to a single word, and each column corresponds to an
|
||||
annotation type. The set of columns used by CoNLL-style files can
|
||||
vary from corpus to corpus; the ``ConllCorpusReader`` constructor
|
||||
therefore takes an argument, ``columntypes``, which is used to
|
||||
specify the columns that are used by a given corpus. By default
|
||||
columns are split by consecutive whitespaces, with the
|
||||
``separator`` argument you can set a string to split by (e.g.
|
||||
``\'\t\'``).
|
||||
|
||||
|
||||
@todo: Add support for reading from corpora where different
|
||||
parallel files contain different columns.
|
||||
@todo: Possibly add caching of the grid corpus view? This would
|
||||
allow the same grid view to be used by different data access
|
||||
methods (eg words() and parsed_sents() could both share the
|
||||
same grid corpus view object).
|
||||
@todo: Better support for -DOCSTART-. Currently, we just ignore
|
||||
it, but it could be used to define methods that retrieve a
|
||||
document at a time (eg parsed_documents()).
|
||||
"""
|
||||
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
# Column Types
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
|
||||
WORDS = 'words' #: column type for words
|
||||
POS = 'pos' #: column type for part-of-speech tags
|
||||
TREE = 'tree' #: column type for parse trees
|
||||
CHUNK = 'chunk' #: column type for chunk structures
|
||||
NE = 'ne' #: column type for named entities
|
||||
SRL = 'srl' #: column type for semantic role labels
|
||||
IGNORE = 'ignore' #: column type for column that should be ignored
|
||||
|
||||
#: A list of all column types supported by the conll corpus reader.
|
||||
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)
|
||||
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
# Constructor
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
|
||||
def __init__(
    self,
    root,
    fileids,
    columntypes,
    chunk_types=None,
    root_label='S',
    pos_in_tree=False,
    srl_includes_roleset=True,
    encoding='utf8',
    tree_class=Tree,
    tagset=None,
    separator=None,
):
    """Validate *columntypes* and store the reader configuration.

    :param columntypes: column type names, in file order; each must be
        one of ``COLUMN_TYPES``.
    :param chunk_types: a single chunk type (string) or a collection of
        them; a lone string is wrapped in a list.
    :param separator: string passed to ``str.split`` when splitting a row
        into columns (``None`` splits on runs of whitespace).
    :raise ValueError: for the first column type not in ``COLUMN_TYPES``.
    """
    for name in columntypes:
        if name in self.COLUMN_TYPES:
            continue
        raise ValueError('Bad column type %r' % name)

    if isinstance(chunk_types, string_types):
        chunk_types = [chunk_types]

    # Map each column type name to its position in a grid row.
    self._colmap = {name: index for (index, name) in enumerate(columntypes)}
    self._chunk_types = chunk_types
    self._root_label = root_label  # for chunks
    self._pos_in_tree = pos_in_tree
    self._tree_class = tree_class
    self._srl_includes_roleset = srl_includes_roleset

    CorpusReader.__init__(self, root, fileids, encoding)

    # Assigned after the base initialiser has run, as in the original.
    self._tagset = tagset
    self.sep = separator
|
||||
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
# Data Access Methods
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
|
||||
def raw(self, fileids=None):
    """Return the contents of the given file(s), concatenated.

    :param fileids: ``None`` for every file of the corpus, a single file
        identifier, or a list of identifiers.
    """
    if fileids is None:
        fileids = self._fileids
    elif isinstance(fileids, string_types):
        fileids = [fileids]

    contents = []
    for fileid in fileids:
        stream = self.open(fileid)
        # Close each stream explicitly instead of leaving it to the
        # garbage collector.
        try:
            contents.append(stream.read())
        finally:
            stream.close()
    return concat(contents)
|
||||
|
||||
def words(self, fileids=None):
    """Return a ``LazyConcatenation`` of the word lists of every grid.

    :raise ValueError: if the corpus has no words column.
    """
    self._require(self.WORDS)
    per_sentence = LazyMap(self._get_words, self._grids(fileids))
    return LazyConcatenation(per_sentence)
|
||||
|
||||
def sents(self, fileids=None):
    """Return a ``LazyMap`` yielding the word list of each grid.

    :raise ValueError: if the corpus has no words column.
    """
    self._require(self.WORDS)
    grids = self._grids(fileids)
    return LazyMap(self._get_words, grids)
|
||||
|
||||
def tagged_words(self, fileids=None, tagset=None):
    """Return a ``LazyConcatenation`` of the (word, tag) pairs of every grid.

    :param tagset: forwarded to ``_get_tagged_words`` for each grid.
    :raise ValueError: if the corpus lacks a words or pos column.
    """
    self._require(self.WORDS, self.POS)
    per_sentence = LazyMap(
        lambda grid: self._get_tagged_words(grid, tagset), self._grids(fileids)
    )
    return LazyConcatenation(per_sentence)
|
||||
|
||||
def tagged_sents(self, fileids=None, tagset=None):
    """Return a ``LazyMap`` yielding the (word, tag) list of each grid.

    :param tagset: forwarded to ``_get_tagged_words`` for each grid.
    :raise ValueError: if the corpus lacks a words or pos column.
    """
    self._require(self.WORDS, self.POS)
    grids = self._grids(fileids)
    return LazyMap(lambda grid: self._get_tagged_words(grid, tagset), grids)
|
||||
|
||||
def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
    """Return a ``LazyConcatenation`` over the chunk tree of every grid.

    :param chunk_types: chunk types to keep; ``None`` falls back to the
        value given to the constructor.
    :raise ValueError: if the corpus lacks a words, pos or chunk column.
    """
    self._require(self.WORDS, self.POS, self.CHUNK)
    selected = self._chunk_types if chunk_types is None else chunk_types
    per_sentence = LazyMap(
        lambda grid: self._get_chunked_words(grid, selected, tagset),
        self._grids(fileids),
    )
    return LazyConcatenation(per_sentence)
|
||||
|
||||
def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
    """Return a ``LazyMap`` yielding the chunk tree of each grid.

    :param chunk_types: chunk types to keep; ``None`` falls back to the
        value given to the constructor.
    :raise ValueError: if the corpus lacks a words, pos or chunk column.
    """
    self._require(self.WORDS, self.POS, self.CHUNK)
    selected = self._chunk_types if chunk_types is None else chunk_types
    return LazyMap(
        lambda grid: self._get_chunked_words(grid, selected, tagset),
        self._grids(fileids),
    )
|
||||
|
||||
def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
    """Return a ``LazyMap`` yielding the parse tree of each grid.

    :param pos_in_tree: ``None`` falls back to the value given to the
        constructor.
    :raise ValueError: if the corpus lacks a words, pos or tree column.
    """
    self._require(self.WORDS, self.POS, self.TREE)
    keep_pos = self._pos_in_tree if pos_in_tree is None else pos_in_tree
    return LazyMap(
        lambda grid: self._get_parsed_sent(grid, keep_pos, tagset),
        self._grids(fileids),
    )
|
||||
|
||||
def srl_spans(self, fileids=None):
    """Return a ``LazyMap`` yielding the SRL span lists of each grid.

    :raise ValueError: if the corpus has no srl column.
    """
    self._require(self.SRL)
    grids = self._grids(fileids)
    return LazyMap(self._get_srl_spans, grids)
|
||||
|
||||
def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
    """Return the SRL instances of the given files.

    :param pos_in_tree: ``None`` falls back to the value given to the
        constructor.
    :param flatten: when true the per-grid instance lists are wrapped in a
        ``LazyConcatenation``; otherwise the ``LazyMap`` of per-grid lists
        is returned as is.
    :raise ValueError: if a words, pos, tree or srl column is missing.
    """
    self._require(self.WORDS, self.POS, self.TREE, self.SRL)
    keep_pos = self._pos_in_tree if pos_in_tree is None else pos_in_tree
    per_sentence = LazyMap(
        lambda grid: self._get_srl_instances(grid, keep_pos), self._grids(fileids)
    )
    return LazyConcatenation(per_sentence) if flatten else per_sentence
|
||||
|
||||
def iob_words(self, fileids=None, tagset=None):
    """
    :return: a list of word/tag/IOB tuples
    :rtype: list(tuple)
    :param fileids: the list of fileids that make up this corpus
    :type fileids: None or str or list
    """
    self._require(self.WORDS, self.POS, self.CHUNK)
    per_sentence = LazyMap(
        lambda grid: self._get_iob_words(grid, tagset), self._grids(fileids)
    )
    return LazyConcatenation(per_sentence)
|
||||
|
||||
def iob_sents(self, fileids=None, tagset=None):
    """
    :return: a list of lists of word/tag/IOB tuples
    :rtype: list(list)
    :param fileids: the list of fileids that make up this corpus
    :type fileids: None or str or list
    """
    self._require(self.WORDS, self.POS, self.CHUNK)
    grids = self._grids(fileids)
    return LazyMap(lambda grid: self._get_iob_words(grid, tagset), grids)
|
||||
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
# Grid Reading
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
|
||||
def _grids(self, fileids=None):
    """Return the concatenation of one grid corpus view per file."""
    # n.b.: we could cache the object returned here (keyed on
    # fileids), which would let us reuse the same corpus view for
    # different things (eg srl and parse trees).
    views = []
    for (filepath, enc) in self.abspaths(fileids, True):
        views.append(
            StreamBackedCorpusView(filepath, self._read_grid_block, encoding=enc)
        )
    return concat(views)
|
||||
|
||||
def _read_grid_block(self, stream):
    """Read the next blank-line-delimited block(s) and split them into grids.

    Each non-empty block becomes a list of rows, and each row a list of
    column values obtained with ``line.split(self.sep)``.

    :raise ValueError: if the rows of a block differ in column count.
    """
    word_column = self._colmap.get('words', 0)
    grids = []
    for block in read_blankline_block(stream):
        block = block.strip()
        if block:
            grid = [line.split(self.sep) for line in block.split('\n')]

            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][word_column] == '-DOCSTART-':
                del grid[0]

            # Check that the grid is consistent.
            if any(len(row) != len(grid[0]) for row in grid):
                raise ValueError('Inconsistent number of columns:\n%s' % block)
            grids.append(grid)
    return grids
|
||||
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
# Transforms
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
# given a grid, transform it into some representation (e.g.,
|
||||
# a list of words or a parse tree).
|
||||
|
||||
def _get_words(self, grid):
|
||||
return self._get_column(grid, self._colmap['words'])
|
||||
|
||||
def _get_tagged_words(self, grid, tagset=None):
|
||||
pos_tags = self._get_column(grid, self._colmap['pos'])
|
||||
if tagset and tagset != self._tagset:
|
||||
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
|
||||
return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))
|
||||
|
||||
def _get_iob_words(self, grid, tagset=None):
|
||||
pos_tags = self._get_column(grid, self._colmap['pos'])
|
||||
if tagset and tagset != self._tagset:
|
||||
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
|
||||
return list(
|
||||
zip(
|
||||
self._get_column(grid, self._colmap['words']),
|
||||
pos_tags,
|
||||
self._get_column(grid, self._colmap['chunk']),
|
||||
)
|
||||
)
|
||||
|
||||
def _get_chunked_words(self, grid, chunk_types, tagset=None):
    """Build a two-level chunk tree from the IOB chunk column of *grid*.

    :param chunk_types: chunk types to keep; chunks of any other type are
        treated as outside (``O``).  ``None`` keeps every type.
    :param tagset: when given and different from the reader's tagset, the
        POS tags are passed through ``map_tag``.
    :return: a ``Tree`` labelled with the reader's root label whose
        children are ``(word, pos_tag)`` pairs and chunk subtrees.
    """
    # n.b.: this method is very similar to conllstr2tree.
    words = self._get_column(grid, self._colmap['words'])
    pos_tags = self._get_column(grid, self._colmap['pos'])
    if tagset and tagset != self._tagset:
        pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
    chunk_tags = self._get_column(grid, self._colmap['chunk'])

    # stack[0] is the root; stack[1], when present, is the open chunk.
    stack = [Tree(self._root_label, [])]

    for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
        if chunk_tag == 'O':
            state, chunk_type = 'O', ''
        else:
            # Split on the first hyphen only, so a chunk type that itself
            # contains a hyphen no longer raises on unpacking.
            (state, chunk_type) = chunk_tag.split('-', 1)
        # If it's a chunk we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'
        # Treat a mismatching I like a B.
        if state == 'I' and chunk_type != stack[-1].label():
            state = 'B'
        # For B or O: close any open chunks
        if state in 'BO' and len(stack) == 2:
            stack.pop()
        # For B: start a new chunk.
        if state == 'B':
            new_chunk = Tree(chunk_type, [])
            stack[-1].append(new_chunk)
            stack.append(new_chunk)
        # Add the word token.
        stack[-1].append((word, pos_tag))

    return stack[0]
|
||||
|
||||
def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
|
||||
words = self._get_column(grid, self._colmap['words'])
|
||||
pos_tags = self._get_column(grid, self._colmap['pos'])
|
||||
if tagset and tagset != self._tagset:
|
||||
pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
|
||||
parse_tags = self._get_column(grid, self._colmap['tree'])
|
||||
|
||||
treestr = ''
|
||||
for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
|
||||
if word == '(':
|
||||
word = '-LRB-'
|
||||
if word == ')':
|
||||
word = '-RRB-'
|
||||
if pos_tag == '(':
|
||||
pos_tag = '-LRB-'
|
||||
if pos_tag == ')':
|
||||
pos_tag = '-RRB-'
|
||||
(left, right) = parse_tag.split('*')
|
||||
right = right.count(')') * ')' # only keep ')'.
|
||||
treestr += '%s (%s %s) %s' % (left, pos_tag, word, right)
|
||||
try:
|
||||
tree = self._tree_class.fromstring(treestr)
|
||||
except (ValueError, IndexError):
|
||||
tree = self._tree_class.fromstring('(%s %s)' % (self._root_label, treestr))
|
||||
|
||||
if not pos_in_tree:
|
||||
for subtree in tree.subtrees():
|
||||
for i, child in enumerate(subtree):
|
||||
if (
|
||||
isinstance(child, Tree)
|
||||
and len(child) == 1
|
||||
and isinstance(child[0], string_types)
|
||||
):
|
||||
subtree[i] = (child[0], child.label())
|
||||
|
||||
return tree
|
||||
|
||||
def _get_srl_spans(self, grid):
|
||||
"""
|
||||
list of list of (start, end), tag) tuples
|
||||
"""
|
||||
if self._srl_includes_roleset:
|
||||
predicates = self._get_column(grid, self._colmap['srl'] + 1)
|
||||
start_col = self._colmap['srl'] + 2
|
||||
else:
|
||||
predicates = self._get_column(grid, self._colmap['srl'])
|
||||
start_col = self._colmap['srl'] + 1
|
||||
|
||||
# Count how many predicates there are. This tells us how many
|
||||
# columns to expect for SRL data.
|
||||
num_preds = len([p for p in predicates if p != '-'])
|
||||
|
||||
spanlists = []
|
||||
for i in range(num_preds):
|
||||
col = self._get_column(grid, start_col + i)
|
||||
spanlist = []
|
||||
stack = []
|
||||
for wordnum, srl_tag in enumerate(col):
|
||||
(left, right) = srl_tag.split('*')
|
||||
for tag in left.split('('):
|
||||
if tag:
|
||||
stack.append((tag, wordnum))
|
||||
for i in range(right.count(')')):
|
||||
(tag, start) = stack.pop()
|
||||
spanlist.append(((start, wordnum + 1), tag))
|
||||
spanlists.append(spanlist)
|
||||
|
||||
return spanlists
|
||||
|
||||
def _get_srl_instances(self, grid, pos_in_tree):
    """Return a ``ConllSRLInstanceList`` with one instance per predicate.

    :raise ValueError: if no span list has a ``V`` / ``C-V`` span covering
        a predicate's word index.
    """
    tree = self._get_parsed_sent(grid, pos_in_tree)
    spanlists = self._get_srl_spans(grid)
    srl_col = self._colmap['srl']
    if self._srl_includes_roleset:
        predicates = self._get_column(grid, srl_col + 1)
        rolesets = self._get_column(grid, srl_col)
    else:
        predicates = self._get_column(grid, srl_col)
        rolesets = [None] * len(predicates)

    verb_tags = ('V', 'C-V')
    instances = ConllSRLInstanceList(tree)
    for wordnum, predicate in enumerate(predicates):
        if predicate == '-':
            continue
        # Decide which spanlist to use.  Don't assume that they're
        # sorted in the same order as the predicates (even though
        # they usually are).
        for candidate in spanlists:
            if any(
                tag in verb_tags and start <= wordnum < end
                for (start, end), tag in candidate
            ):
                break
        else:
            raise ValueError('No srl column found for %r' % predicate)
        instances.append(
            ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], candidate)
        )

    return instances
|
||||
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
# Helper Methods
|
||||
# /////////////////////////////////////////////////////////////////
|
||||
|
||||
def _require(self, *columntypes):
|
||||
for columntype in columntypes:
|
||||
if columntype not in self._colmap:
|
||||
raise ValueError(
|
||||
'This corpus does not contain a %s ' 'column.' % columntype
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_column(grid, column_index):
|
||||
return [grid[i][column_index] for i in range(len(grid))]
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
|
||||
class ConllSRLInstance(object):
|
||||
"""
|
||||
An SRL instance from a CoNLL corpus, which identifies and
|
||||
providing labels for the arguments of a single verb.
|
||||
"""
|
||||
|
||||
# [xx] add inst.core_arguments, inst.argm_arguments?
|
||||
|
||||
def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
|
||||
self.verb = []
|
||||
"""A list of the word indices of the words that compose the
|
||||
verb whose arguments are identified by this instance.
|
||||
This will contain multiple word indices when multi-word
|
||||
verbs are used (e.g. 'turn on')."""
|
||||
|
||||
self.verb_head = verb_head
|
||||
"""The word index of the head word of the verb whose arguments
|
||||
are identified by this instance. E.g., for a sentence that
|
||||
uses the verb 'turn on,' ``verb_head`` will be the word index
|
||||
of the word 'turn'."""
|
||||
|
||||
self.verb_stem = verb_stem
|
||||
|
||||
self.roleset = roleset
|
||||
|
||||
self.arguments = []
|
||||
"""A list of ``(argspan, argid)`` tuples, specifying the location
|
||||
and type for each of the arguments identified by this
|
||||
instance. ``argspan`` is a tuple ``start, end``, indicating
|
||||
that the argument consists of the ``words[start:end]``."""
|
||||
|
||||
self.tagged_spans = tagged_spans
|
||||
"""A list of ``(span, id)`` tuples, specifying the location and
|
||||
type for each of the arguments, as well as the verb pieces,
|
||||
that make up this instance."""
|
||||
|
||||
self.tree = tree
|
||||
"""The parse tree for the sentence containing this instance."""
|
||||
|
||||
self.words = tree.leaves()
|
||||
"""A list of the words in the sentence containing this
|
||||
instance."""
|
||||
|
||||
# Fill in the self.verb and self.arguments values.
|
||||
for (start, end), tag in tagged_spans:
|
||||
if tag in ('V', 'C-V'):
|
||||
self.verb += list(range(start, end))
|
||||
else:
|
||||
self.arguments.append(((start, end), tag))
|
||||
|
||||
def __repr__(self):
|
||||
# Originally, its:
|
||||
##plural = 's' if len(self.arguments) != 1 else ''
|
||||
plural = 's' if len(self.arguments) != 1 else ''
|
||||
return '<ConllSRLInstance for %r with %d argument%s>' % (
|
||||
(self.verb_stem, len(self.arguments), plural)
|
||||
)
|
||||
|
||||
def pprint(self):
    """Return a bracketed, wrapped rendering of this instance.

    The first line names the verb and its stem.  The sentence follows,
    with each argument wrapped as ``[ARGID words...]`` and each verb word
    wrapped as ``<<word>>``.  Leaves may be plain strings or
    ``(word, tag)`` tuples; only the word part is printed.
    """

    def surface(leaf):
        # Leaves are (word, tag) tuples when POS tags were kept out of
        # the tree, and plain strings otherwise.
        return leaf[0] if isinstance(leaf, tuple) else leaf

    verbstr = ' '.join(surface(self.words[i]) for i in self.verb)
    hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)

    pieces = []
    for i, word in enumerate(self.words):
        word = surface(word)
        for (start, end), argid in self.arguments:
            if i == start:
                pieces.append('[%s ' % argid)
            if i == end:
                pieces.append('] ')
        if i in self.verb:
            word = '<<%s>>' % word
        pieces.append(word + ' ')

    # ``end`` is exclusive, so an argument that runs to the last word is
    # never met by the loop above; close its bracket here.
    for (start, end), argid in self.arguments:
        if end == len(self.words):
            pieces.append('] ')

    s = ''.join(pieces)
    return hdr + textwrap.fill(
        s.replace(' ]', ']'), initial_indent=' ', subsequent_indent=' '
    )
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
|
||||
class ConllSRLInstanceList(list):
|
||||
"""
|
||||
Set of instances for a single sentence
|
||||
"""
|
||||
|
||||
def __init__(self, tree, instances=()):
|
||||
self.tree = tree
|
||||
list.__init__(self, instances)
|
||||
|
||||
def __str__(self):
|
||||
return self.pprint()
|
||||
|
||||
def pprint(self, include_tree=False):
    """Return the instances as CoNLL-style columns, one row per word.

    Each row holds the verb stem (or ``-``) of the instance whose
    ``verb_head`` is that word, followed by one bracketed span column per
    instance.  With *include_tree*, word, POS and syntax columns computed
    by ``_tree2conll`` are prepended.

    :raise ValueError: if an instance's tree differs from ``self.tree``.
    """
    # Sanity check: trees should be the same
    for inst in self:
        if inst.tree != self.tree:
            raise ValueError('Tree mismatch!')

    # The word list drives the row loop below, so it is needed whether or
    # not the tree columns are printed.
    words = self.tree.leaves()

    # If desired, add trees:
    if include_tree:
        pos = [None] * len(words)
        synt = ['*'] * len(words)
        self._tree2conll(self.tree, 0, words, pos, synt)

    rows = []
    for i in range(len(words)):
        cells = []
        # optional tree columns
        if include_tree:
            cells.append('%-20s ' % words[i])
            cells.append('%-8s ' % pos[i])
            cells.append('%15s*%-8s ' % tuple(synt[i].split('*')))

        # verb head column
        for inst in self:
            if i == inst.verb_head:
                cells.append('%-20s ' % inst.verb_stem)
                break
        else:
            cells.append('%-20s ' % '-')

        # Remaining columns: one span column per instance
        for inst in self:
            argstr = '*'
            for (start, end), argid in inst.tagged_spans:
                if i == start:
                    argstr = '(%s%s' % (argid, argstr)
                if i == (end - 1):
                    argstr += ')'
            cells.append('%-12s ' % argstr)
        cells.append('\n')
        rows.append(''.join(cells))
    return ''.join(rows)
|
||||
|
||||
def _tree2conll(self, tree, wordnum, words, pos, synt):
    """Fill the *pos* and *synt* columns for the words covered by *tree*.

    :param wordnum: index of the first word covered by *tree*.
    :param words: sentence leaves; an entry that is a ``(word, tag)``
        tuple is replaced in place by its word.
    :param pos: list receiving the POS tag of each word.
    :param synt: list of ``'*'``-based strings receiving the opening and
        closing brackets of each phrase.
    :return: the index of the first word after *tree*.
    """
    assert isinstance(tree, Tree)
    if len(tree) == 1 and isinstance(tree[0], string_types):
        # Preterminal: the label is the POS tag of its single word.
        pos[wordnum] = tree.label()
        assert words[wordnum] == tree[0]
        return wordnum + 1
    elif len(tree) == 1 and isinstance(tree[0], tuple):
        assert len(tree[0]) == 2
        # The leaf is a (word, tag) pair: store the word in the words
        # column and the tag in the POS column.
        words[wordnum], pos[wordnum] = tree[0]
        return wordnum + 1
    else:
        synt[wordnum] = '(%s%s' % (tree.label(), synt[wordnum])
        for child in tree:
            wordnum = self._tree2conll(child, wordnum, words, pos, synt)
        synt[wordnum - 1] += ')'
        return wordnum
|
||||
|
||||
|
||||
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding='utf8', tagset=None, separator=None
    ):
        """Initialise the base reader with the fixed three-column layout."""
        columntypes = ('words', 'pos', 'chunk')
        ConllCorpusReader.__init__(
            self,
            root,
            fileids,
            columntypes,
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )
|
||||
119
venv/lib/python3.7/site-packages/nltk/corpus/reader/crubadan.py
Normal file
119
venv/lib/python3.7/site-packages/nltk/corpus/reader/crubadan.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: An Crubadan N-grams Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
||||
#
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
An NLTK interface for the n-gram statistics gathered from
|
||||
the corpora for each language using An Crubadan.
|
||||
|
||||
There are multiple potential applications for the data but
|
||||
this reader was created with the goal of using it in the
|
||||
context of language identification.
|
||||
|
||||
For details about An Crubadan, this data, and its potential uses, see:
|
||||
http://borel.slu.edu/crubadan/index.html
|
||||
"""
|
||||
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import re
|
||||
from os import path
|
||||
|
||||
from nltk.compat import PY3
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
from nltk.probability import FreqDist
|
||||
from nltk.data import ZipFilePathPointer
|
||||
|
||||
|
||||
class CrubadanCorpusReader(CorpusReader):
|
||||
"""
|
||||
A corpus reader used to access language An Crubadan n-gram files.
|
||||
"""
|
||||
|
||||
_LANG_MAPPER_FILE = 'table.txt'
|
||||
_all_lang_freq = {}
|
||||
|
||||
def __init__(self, root, fileids, encoding='utf8', tagset=None):
    ''' Initialise the reader and load the language mapping table.

    NOTE(review): ``encoding`` and ``tagset`` are accepted but not
    forwarded; the base class is always initialised with ``'utf8'``. '''
    super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8')
    # Start from an empty table; the loader below replaces it.
    self._lang_mapping_data = []
    self._load_lang_mapping_data()
|
||||
|
||||
def lang_freq(self, lang):
    ''' Return n-gram FreqDist for a specific language
        given ISO 639-3 language code.

        The distribution is loaded on first request and kept in
        ``_all_lang_freq`` for later calls. '''
    cache = self._all_lang_freq
    try:
        return cache[lang]
    except KeyError:
        cache[lang] = self._load_lang_ngrams(lang)
        return cache[lang]
|
||||
|
||||
def langs(self):
    ''' Return a list of supported languages as ISO 639-3 codes
        (the second field of every mapping row). '''
    codes = []
    for row in self._lang_mapping_data:
        codes.append(row[1])
    return codes
|
||||
|
||||
def iso_to_crubadan(self, lang):
    ''' Return internal Crubadan code based on ISO 639-3 code.

        The comparison ignores case; ``None`` is returned when no
        mapping row matches. '''
    matches = (
        row[0]
        for row in self._lang_mapping_data
        if row[1].lower() == lang.lower()
    )
    return next(matches, None)
|
||||
|
||||
def crubadan_to_iso(self, lang):
    ''' Return ISO 639-3 code given internal Crubadan code.

        The comparison ignores case; ``None`` is returned when no
        mapping row matches. '''
    matches = (
        row[1]
        for row in self._lang_mapping_data
        if row[0].lower() == lang.lower()
    )
    return next(matches, None)
|
||||
|
||||
def _load_lang_mapping_data(self):
    ''' Load language mappings between codes and description from table.txt

        The file is read as UTF-8 and stored in ``_lang_mapping_data``
        as one list of tab-separated fields per line.

        :raise RuntimeError: if the corpus root is a zip-file pointer, or
            if the mapper file is not among the corpus file ids. '''
    if isinstance(self.root, ZipFilePathPointer):
        raise RuntimeError(
            "Please install the 'crubadan' corpus first, use nltk.download()"
        )

    mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
    if self._LANG_MAPPER_FILE not in self.fileids():
        raise RuntimeError("Could not find language mapper file: " + mapper_file)

    # Use context managers so the file handle is closed once it is read.
    if PY3:
        with open(mapper_file, 'r', encoding='utf-8') as stream:
            raw = stream.read().strip()
    else:
        with open(mapper_file, 'rU') as stream:
            raw = stream.read().decode('utf-8').strip()

    self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]
|
||||
|
||||
def _load_lang_ngrams(self, lang):
    ''' Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist

        Each line of the file is split on single spaces: the first field
        is the frequency and the second the n-gram.

        :raise RuntimeError: if *lang* is not a supported language or its
            ``<code>-3grams.txt`` file does not exist. '''

    if lang not in self.langs():
        raise RuntimeError("Unsupported language.")

    crubadan_code = self.iso_to_crubadan(lang)
    ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

    if not path.isfile(ngram_file):
        raise RuntimeError("No N-gram file found for requested language.")

    counts = FreqDist()
    if PY3:
        f = open(ngram_file, 'r', encoding='utf-8')
    else:
        f = open(ngram_file, 'rU')

    # Close the file when done, even if a line fails to parse.
    with f:
        for line in f:
            if PY3:
                data = line.split(' ')
            else:
                data = line.decode('utf8').split(' ')

            ngram = data[1].strip('\n')
            freq = int(data[0])

            counts[ngram] = freq

    return counts
|
||||
@@ -0,0 +1,134 @@
|
||||
# Natural Language Toolkit: Dependency Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
|
||||
# Iker Manterola <returntothehangar@hotmail.com>
|
||||
#
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import codecs
|
||||
|
||||
from nltk.parse import DependencyGraph
|
||||
from nltk.tokenize import *
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class DependencyCorpusReader(SyntaxCorpusReader):
|
||||
def __init__(
    self,
    root,
    fileids,
    encoding='utf8',
    word_tokenizer=TabTokenizer(),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    para_block_reader=read_blankline_block,
):
    """Initialise the reader through ``CorpusReader.__init__``.

    NOTE(review): ``word_tokenizer``, ``sent_tokenizer`` and
    ``para_block_reader`` are accepted but not used by this initialiser.
    """
    # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
    # from CorpusReader?
    CorpusReader.__init__(self, root, fileids, encoding)
|
||||
|
||||
#########################################################
|
||||
|
||||
def raw(self, fileids=None):
    """
    :return: the given file(s) as a single string.
    :rtype: str
    """
    result = []
    for fileid, encoding in self.abspaths(fileids, include_encoding=True):
        if isinstance(fileid, PathPointer):
            stream = fileid.open(encoding=encoding)
            # Close the pointer's stream explicitly, matching the
            # ``with`` block used for plain paths below.
            try:
                result.append(stream.read())
            finally:
                stream.close()
        else:
            with codecs.open(fileid, "r", encoding) as fp:
                result.append(fp.read())
    return concat(result)
|
||||
|
||||
def words(self, fileids=None):
    """Return the concatenation of untagged, ungrouped views of the files."""
    views = []
    for filepath, enc in self.abspaths(fileids, include_encoding=True):
        views.append(
            DependencyCorpusView(
                filepath,
                tagged=False,
                group_by_sent=False,
                dependencies=False,
                encoding=enc,
            )
        )
    return concat(views)
|
||||
|
||||
def tagged_words(self, fileids=None):
    """Return the concatenation of tagged, ungrouped views of the files."""
    views = []
    for filepath, enc in self.abspaths(fileids, include_encoding=True):
        views.append(
            DependencyCorpusView(
                filepath,
                tagged=True,
                group_by_sent=False,
                dependencies=False,
                encoding=enc,
            )
        )
    return concat(views)
|
||||
|
||||
def sents(self, fileids=None):
    """Return the concatenation of untagged, sentence-grouped views."""
    views = []
    for filepath, enc in self.abspaths(fileids, include_encoding=True):
        views.append(
            DependencyCorpusView(
                filepath,
                tagged=False,
                group_by_sent=True,
                dependencies=False,
                encoding=enc,
            )
        )
    return concat(views)
|
||||
|
||||
def tagged_sents(self, fileids=None):
    """Return the concatenation of tagged, sentence-grouped views."""
    views = []
    for filepath, enc in self.abspaths(fileids, include_encoding=True):
        views.append(
            DependencyCorpusView(
                filepath,
                tagged=True,
                group_by_sent=True,
                dependencies=False,
                encoding=enc,
            )
        )
    return concat(views)
|
||||
|
||||
def parsed_sents(self, fileids=None):
    """Return a list with one ``DependencyGraph`` per sentence.

    Sentences are read with ``dependencies`` enabled, so each is passed
    to ``DependencyGraph`` as the block's raw text.
    """
    views = []
    for filepath, enc in self.abspaths(fileids, include_encoding=True):
        views.append(
            DependencyCorpusView(
                filepath,
                tagged=False,
                group_by_sent=True,
                dependencies=True,
                encoding=enc,
            )
        )
    graphs = []
    for sent in concat(views):
        graphs.append(DependencyGraph(sent))
    return graphs
|
||||
|
||||
|
||||
class DependencyCorpusView(StreamBackedCorpusView):
    """A corpus view that reads one blank-line-delimited sentence per block.

    Depending on the flags given to the constructor, a sentence is returned
    as raw text (``dependencies``), as ``(word, tag)`` pairs (``tagged``)
    or as plain words, either wrapped in a one-element list
    (``group_by_sent``) or flattened.
    """

    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'  # marks the start of a document

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding='utf8',
    ):
        """Store the output flags and initialise the underlying view."""
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read and convert the next sentence from *stream*.

        :raise ValueError: if the first line of the sentence does not have
            3, 4 or 10 tab-separated fields (only checked when
            ``dependencies`` is false).
        """
        # Read the next sentence.  Guard against the helper returning no
        # block at all, which would otherwise raise IndexError.
        blocks = read_blankline_block(stream)
        if not blocks:
            return []
        sent = blocks[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split('\t') for line in sent.split('\n')]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError('Unexpected number of fields in dependency tree file')

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
|
||||
3459
venv/lib/python3.7/site-packages/nltk/corpus/reader/framenet.py
Normal file
3459
venv/lib/python3.7/site-packages/nltk/corpus/reader/framenet.py
Normal file
File diff suppressed because it is too large
Load Diff
129
venv/lib/python3.7/site-packages/nltk/corpus/reader/ieer.py
Normal file
129
venv/lib/python3.7/site-packages/nltk/corpus/reader/ieer.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# Natural Language Toolkit: IEER Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the Information Extraction and Entity Recognition Corpus.
|
||||
|
||||
NIST 1999 Information Extraction: Entity Recognition Evaluation
|
||||
http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
|
||||
|
||||
This corpus contains the NEWSWIRE development test data for the
|
||||
NIST 1999 IE-ER Evaluation. The files were taken from the
|
||||
subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt
|
||||
and filenames were shortened.
|
||||
|
||||
The corpus contains the following files: APW_19980314, APW_19980424,
|
||||
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from six import string_types
|
||||
|
||||
import nltk
|
||||
from nltk import compat
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
#: Maps the name of every document in this corpus to a short
#: description of that document's contents.
titles = dict(
    [
        ('APW_19980314', 'Associated Press Weekly, 14 March 1998'),
        ('APW_19980424', 'Associated Press Weekly, 24 April 1998'),
        ('APW_19980429', 'Associated Press Weekly, 29 April 1998'),
        ('NYT_19980315', 'New York Times, 15 March 1998'),
        ('NYT_19980403', 'New York Times, 3 April 1998'),
        ('NYT_19980407', 'New York Times, 7 April 1998'),
    ]
)

#: Sorted names of all documents in this corpus.
documents = sorted(titles.keys())
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class IEERDocument(object):
    """
    One document of the IEER corpus: its text plus the optional header
    fields (document number, document type, date/time and headline).

    ``__repr__`` calls ``leaves()`` on ``text`` and on a non-empty
    ``headline``, so both are expected to provide that method.
    """

    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=''):
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        if self.headline:
            summary = ' '.join(self.headline.leaves())
        else:
            # No headline: show the first twelve leaves of the text that
            # do not start with '<', followed by an ellipsis.
            visible = [leaf for leaf in self.text.leaves() if leaf[:1] != '<']
            summary = ' '.join(visible[:12]) + '...'

        if self.docno is None:
            return '<IEERDocument: %r>' % summary
        return '<IEERDocument %s: %r>' % (self.docno, summary)
|
||||
|
||||
|
||||
class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for IEER files, in which every document is enclosed
    between a ``<DOC>`` line and a ``</DOC>`` line.

    ``docs()`` yields the documents as strings; ``parsed_docs()``
    yields them as ``IEERDocument`` objects.
    """

    def raw(self, fileids=None):
        """
        :return: the concatenated contents of the given file(s).
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for f in fileids:
            # Close every stream explicitly instead of leaving it to
            # the garbage collector.
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def docs(self, fileids=None):
        """
        :return: the documents of the given file(s), each as one string.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def parsed_docs(self, fileids=None):
        """
        :return: the documents of the given file(s) as ``IEERDocument``
            objects; documents without a document number are left out.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_parsed_block(self, stream):
        """Read one document and return it parsed, unless it has no docno."""
        # TODO: figure out why empty documents are being returned
        # Parse every document only once, then filter on the result.
        parsed = [self._parse(doc) for doc in self._read_block(stream)]
        return [doc for doc in parsed if doc.docno is not None]

    def _parse(self, doc):
        """Convert the text of one document into an ``IEERDocument``."""
        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
        if isinstance(val, dict):
            # The dict keys are passed as IEERDocument keyword arguments.
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """
        Read the next document from ``stream``.

        :return: a one-element list holding the document's lines joined
            with newlines.
        """
        out = []
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == '<DOC>':
                break
        # Keep the line that ended the preamble (the opening marker, or
        # the empty string at end of file).
        out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == '</DOC>':
                break
        # Return the document
        return ['\n'.join(out)]
|
||||
103
venv/lib/python3.7/site-packages/nltk/corpus/reader/indian.py
Normal file
103
venv/lib/python3.7/site-packages/nltk/corpus/reader/indian.py
Normal file
@@ -0,0 +1,103 @@
|
||||
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Indian Language POS-Tagged Corpus
|
||||
Collected by A Kumaran, Microsoft Research, India
|
||||
Distributed with permission
|
||||
|
||||
Contents:
|
||||
- Bangla: IIT Kharagpur
|
||||
- Hindi: Microsoft Research India
|
||||
- Marathi: IIT Bombay
|
||||
- Telugu: IIIT Hyderabad
|
||||
"""
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.tag import str2tuple, map_tag
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class IndianCorpusReader(CorpusReader):
    """
    Reader for the Indian Language POS-Tagged Corpus.

    Each line of a corpus file is read as one sentence made of
    whitespace-separated ``word_TAG`` tokens; lines starting with
    ``<`` are skipped (see ``IndianCorpusView``).
    """

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words.
        """
        return self._views(fileids, False, False)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :param tagset: if given and different from the corpus' own
            tagset, every tag is converted with ``map_tag``.
        :return: the given file(s) as a list of ``(word, tag)`` tuples.
        """
        return self._views(fileids, True, False, self._tag_mapper(tagset))

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each a list
            of words.
        """
        return self._views(fileids, False, True)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :param tagset: if given and different from the corpus' own
            tagset, every tag is converted with ``map_tag``.
        :return: the given file(s) as a list of sentences, each a list
            of ``(word, tag)`` tuples.
        """
        return self._views(fileids, True, True, self._tag_mapper(tagset))

    def raw(self, fileids=None):
        """
        :return: the concatenated contents of the given file(s).
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for f in fileids:
            # Close every stream explicitly instead of leaving it to
            # the garbage collector.
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _tag_mapper(self, tagset):
        """Return a tag-conversion function for ``tagset``, or None."""
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def _views(self, fileids, tagged, group_by_sent, tag_mapping_function=None):
        """Concatenate one ``IndianCorpusView`` per selected file."""
        return concat(
            [
                IndianCorpusView(
                    fileid, enc, tagged, group_by_sent, tag_mapping_function
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )
|
||||
|
||||
|
||||
class IndianCorpusView(StreamBackedCorpusView):
    """
    Corpus view that reads one line per block and splits it into
    ``word_TAG`` tokens.
    """

    def __init__(
        self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
    ):
        """
        :param tagged: if true, tokens are ``(word, tag)`` tuples,
            otherwise bare words.
        :param group_by_sent: if true, every line is returned as one
            sentence instead of a flat list of tokens.
        :param tag_mapping_function: optional callable applied to
            every tag.
        """
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read the next line of ``stream`` and return its tokens."""
        text = stream.readline()
        # Lines that start with '<' carry no tokens.
        if text.startswith('<'):
            return []

        tokens = [str2tuple(chunk, sep='_') for chunk in text.split()]

        convert = self._tag_mapping_function
        if convert:
            tokens = [(word, convert(tag)) for (word, tag) in tokens]

        if not self._tagged:
            tokens = [word for (word, tag) in tokens]

        return [tokens] if self._group_by_sent else tokens
|
||||
368
venv/lib/python3.7/site-packages/nltk/corpus/reader/ipipan.py
Normal file
368
venv/lib/python3.7/site-packages/nltk/corpus/reader/ipipan.py
Normal file
@@ -0,0 +1,368 @@
|
||||
# Natural Language Toolkit: IPI PAN Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import functools
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
|
||||
|
||||
def _parse_args(fun):
|
||||
@functools.wraps(fun)
|
||||
def decorator(self, fileids=None, **kwargs):
|
||||
kwargs.pop('tags', None)
|
||||
if not fileids:
|
||||
fileids = self.fileids()
|
||||
return fun(self, fileids, **kwargs)
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
class IPIPANCorpusReader(CorpusReader):
    """
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channels='prasa')``, ``fileids(categories='publicystyczny')``.
    Only one of these filters may be given per call.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all disambiguated tags by specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, None, None)

    def raw(self, fileids=None):
        """
        :return: the concatenated contents of the given morph file(s).
        """
        if not fileids:
            fileids = self.fileids()

        filecontents = []
        for fileid in self._list_morph_files(fileids):
            with open(fileid, 'r') as infile:
                filecontents.append(infile.read())
        return ''.join(filecontents)

    def channels(self, fileids=None):
        """
        :return: the distinct ``channel`` header values of the file(s).
        """
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, 'channel')

    def domains(self, fileids=None):
        """
        :return: the distinct ``domain`` header values of the file(s).
        """
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, 'domain')

    def categories(self, fileids=None):
        """
        :return: the distinct ``keyTerm`` header values of the file(s),
            each passed through ``_map_category``.
        """
        if not fileids:
            fileids = self.fileids()
        return [
            self._map_category(cat) for cat in self._parse_header(fileids, 'keyTerm')
        ]

    def fileids(self, channels=None, domains=None, categories=None):
        """
        Return the file identifiers of the corpus, optionally restricted
        by exactly one metadata filter.

        :param channels: a channel name or a list of channel names.
        :param domains: a domain name or a list of domain names.
        :param categories: a category name or a list of category names.
        :raise ValueError: if more than one filter is given.
        """
        given = [
            arg for arg in (channels, domains, categories) if arg is not None
        ]
        # The filters are mutually exclusive, so reject any combination
        # of two or more (not only the case where all three are given).
        if len(given) > 1:
            raise ValueError(
                'You can specify only one of channels, domains '
                'and categories parameter at once'
            )
        if not given:
            return CorpusReader.fileids(self)

        if isinstance(channels, string_types):
            channels = [channels]
        if isinstance(domains, string_types):
            domains = [domains]
        if isinstance(categories, string_types):
            categories = [categories]

        # Exactly one filter is set at this point.
        if channels is not None:
            return self._list_morph_files_by('channel', channels)
        elif domains is not None:
            return self._list_morph_files_by('domain', domains)
        else:
            return self._list_morph_files_by(
                'keyTerm', categories, map=self._map_category
            )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """Return the file(s) as a list of sentences of untagged words."""
        return self._views(
            fileids, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
        )

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        """Return the file(s) as a list of paragraphs of untagged sentences."""
        return self._views(
            fileids, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """Return the file(s) as a list of untagged words."""
        return self._views(fileids, tags=False, **kwargs)

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        """Return the file(s) as a list of sentences of tagged words."""
        return self._views(fileids, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        """Return the file(s) as a list of paragraphs of tagged sentences."""
        return self._views(fileids, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """Return the file(s) as a list of tagged words."""
        return self._views(fileids, **kwargs)

    def _views(self, fileids, **view_kwargs):
        """Concatenate one corpus view per selected morph file."""
        return concat(
            [
                self._view(fileid, **view_kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    def _list_morph_files(self, fileids):
        """Return the absolute paths of the given file identifiers."""
        return [f for f in self.abspaths(fileids)]

    def _list_header_files(self, fileids):
        """Return the header file path that belongs to each morph file."""
        return [
            f.replace('morph.xml', 'header.xml')
            for f in self._list_morph_files(fileids)
        ]

    def _parse_header(self, fileids, tag):
        """Collect the distinct values of ``tag`` from the header files."""
        values = set()
        for f in self._list_header_files(fileids):
            values.update(self._get_tag(f, tag))
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """
        Return the file identifiers whose header has a ``tag`` value
        (after the optional ``map`` conversion) contained in ``values``.
        """
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace('morph.xml', 'header.xml')
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """
        Return the text found between every ``<tag`` opening and the
        following ``</tag>`` in file ``f``.
        """
        tags = []
        with open(f, 'r') as infile:
            header = infile.read()
        tag_end = 0
        while True:
            tag_pos = header.find('<' + tag, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find('</' + tag + '>', tag_pos)
            if tag_end < 0:
                # Unterminated element: stop rather than record a
                # meaningless slice of the header.
                return tags
            # Skip '<', the tag name and the single character after it.
            tags.append(header[tag_pos + len(tag) + 2 : tag_end])

    def _map_category(self, cat):
        """Return the part of ``cat`` after its first '>' (all of it if none)."""
        pos = cat.find('>')
        if pos == -1:
            return cat
        else:
            return cat[pos + 1 :]

    def _view(self, filename, **kwargs):
        """
        Validate the view options and build an ``IPIPANCorpusView``.

        :raise ValueError: on unknown options or on conflicting ones.
        """
        tags = kwargs.pop('tags', True)
        mode = kwargs.pop('mode', 0)
        simplify_tags = kwargs.pop('simplify_tags', False)
        one_tag = kwargs.pop('one_tag', True)
        disamb_only = kwargs.pop('disamb_only', True)
        append_no_space = kwargs.pop('append_no_space', False)
        append_space = kwargs.pop('append_space', False)
        replace_xmlentities = kwargs.pop('replace_xmlentities', True)

        if len(kwargs) > 0:
            raise ValueError('Unexpected arguments: %s' % kwargs.keys())
        if not one_tag and not disamb_only:
            raise ValueError(
                'You cannot specify both one_tag=False and ' 'disamb_only=False'
            )
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError(
                'You cannot specify simplify_tags, one_tag or '
                'disamb_only with functions other than tagged_*'
            )

        return IPIPANCorpusView(
            filename,
            tags=tags,
            mode=mode,
            simplify_tags=simplify_tags,
            one_tag=one_tag,
            disamb_only=disamb_only,
            append_no_space=append_no_space,
            append_space=append_space,
            replace_xmlentities=replace_xmlentities,
        )
|
||||
|
||||
|
||||
class IPIPANCorpusView(StreamBackedCorpusView):
    """
    Corpus view over one IPI PAN morph file.

    The file is scanned line by line; ``mode`` selects whether a block
    is the words of one sentence, one sentence, or one paragraph.
    """

    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2

    def __init__(self, filename, startpos=0, **kwargs):
        """
        :param filename: path of the morph file.
        :param startpos: passed on to ``StreamBackedCorpusView``.
        :param kwargs: view options: ``tags``, ``disamb_only``,
            ``mode``, ``simplify_tags``, ``one_tag``,
            ``append_no_space``, ``append_space`` and
            ``replace_xmlentities``.
        """
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        self.in_sentence = False
        self.position = 0

        self.show_tags = kwargs.pop('tags', True)
        self.disamb_only = kwargs.pop('disamb_only', True)
        self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop('simplify_tags', False)
        self.one_tag = kwargs.pop('one_tag', True)
        self.append_no_space = kwargs.pop('append_no_space', False)
        self.append_space = kwargs.pop('append_space', False)
        self.replace_xmlentities = kwargs.pop('replace_xmlentities', True)

    def read_block(self, stream):
        """
        Read the next block from ``stream``.

        :return: the tokens of one sentence (WORDS_MODE), ``[sentence]``
            (SENTS_MODE) or ``[list of sentences]`` (PARAS_MODE); an
            empty list once the input is exhausted.
        """
        sentence = []
        sentences = []
        space = False
        no_space = False

        tags = set()

        lines = self._read_data(stream)

        while True:

            # we may have only part of last line
            if len(lines) <= 1:
                self._seek(stream)
                lines = self._read_data(stream)

            if lines == ['']:
                assert not sentences
                return []

            line = lines.pop()
            # Account for the consumed line plus its newline.
            self.position += len(line) + 1

            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith('<tok'):
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith('</chunk'):
                if self.in_sentence:
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        sentences.append(sentence)
                        # Start the next sentence of the paragraph from
                        # a clean state, exactly as a fresh call does in
                        # SENTS_MODE.  Without this every sentence of
                        # the paragraph is the same, ever-growing list.
                        sentence = []
                        space = False
                        no_space = False
                elif self.mode == self.PARAS_MODE:
                    self._seek(stream)
                    return [sentences]
            elif line.startswith('<orth'):
                # Drop the 6 leading and 7 trailing characters
                # (assumes the line is exactly '<orth>...</orth>').
                orth = line[6:-7]
                if self.replace_xmlentities:
                    orth = orth.replace('&quot;', '"').replace('&amp;', '&')
            elif line.startswith('<lex'):
                if not self.disamb_only or line.find('disamb=') != -1:
                    tag = line[line.index('<ctag') + 6 : line.index('</ctag')]
                    tags.add(tag)
            elif line.startswith('</tok'):
                if self.show_tags:
                    if self.simplify_tags:
                        # Keep only the part before the first ':'.
                        tags = [t.split(':')[0] for t in tags]
                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith('<ns/>'):
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(('', 'no-space'))
                    else:
                        sentence.append('')
            elif line.startswith('</cesAna'):
                pass

    def _read_data(self, stream):
        """
        Read up to 4096 characters and return them split into lines, in
        reverse order so that ``pop()`` yields the lines in file order.
        """
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split('\n')
        lines.reverse()
        return lines

    def _seek(self, stream):
        """Move ``stream`` to the position tracked by this view."""
        stream.seek(self.position)

    def _append_space(self, sentence):
        """Append a space token (tagged or bare) to ``sentence``."""
        if self.show_tags:
            sentence.append((' ', 'space'))
        else:
            sentence.append(' ')
|
||||
194
venv/lib/python3.7/site-packages/nltk/corpus/reader/knbc.py
Normal file
194
venv/lib/python3.7/site-packages/nltk/corpus/reader/knbc.py
Normal file
@@ -0,0 +1,194 @@
|
||||
#! /usr/bin/env python
|
||||
# KNB Corpus reader
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Masato Hagiwara <hagisan@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
from six import string_types
|
||||
|
||||
from nltk.parse import DependencyGraph
|
||||
|
||||
from nltk.corpus.reader.util import (
|
||||
FileSystemPathPointer,
|
||||
find_corpus_fileids,
|
||||
read_blankline_block,
|
||||
)
|
||||
from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
|
||||
|
||||
# default function to convert morphlist to str for tree representation
|
||||
_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS')
|
||||
|
||||
|
||||
class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example
    -------------

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    """

    def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
        #       from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # Blocks are separated by a blank line (or end of file).
        return read_blankline_block(stream)

    def _word(self, t):
        # Header lines (matching EOS, '*', '#' or '+' at the start) are
        # skipped; the first space-separated cell of every other line
        # is the word.
        return [
            line.strip().split(" ")[0]
            for line in t.splitlines()
            if not re.match(r"EOS|\*|\#|\+", line)
        ]

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        tagged = []
        for line in t.splitlines():
            # Skip the header lines.
            if re.match(r"EOS|\*|\#|\+", line):
                continue
            cells = line.strip().split(" ")
            # The word, followed by the remaining cells as one string.
            tagged.append((cells[0], ' '.join(cells[1:])))
        return tagged

    def _parse(self, t):
        graph = DependencyGraph()
        index = 0
        for line in t.splitlines():
            lead = line[0]
            if lead in '*+':
                # Start of a bunsetsu or tag: the second cell holds the
                # parent index followed by the relation letter.
                cells = line.strip().split(" ", 3)
                match = re.match(r"([\-0-9]*)([ADIP])", cells[1])

                assert match is not None

                node = graph.nodes[index]
                node.update({'address': index, 'rel': match.group(2), 'word': []})

                parent = int(match.group(1))
                if parent == -1:
                    graph.root = node
                else:
                    graph.nodes[parent]['deps'].append(index)

                index += 1
            elif lead != '#':
                # Normal morph line: attach it to the most recent node.
                cells = line.strip().split(" ")
                morph = cells[0], ' '.join(cells[1:])
                graph.nodes[index - 1]['word'].append(morph)

        if self.morphs2str:
            # Collapse the morph list of every node into one string.
            for node in graph.nodes.values():
                node['word'] = self.morphs2str(node['word'])

        return graph.tree()
|
||||
|
||||
|
||||
######################################################################
|
||||
# Demo
|
||||
######################################################################
|
||||
|
||||
|
||||
def demo():
    """Print file ids, words, parse trees and tagged sentences of the KNB corpus."""

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    candidates = find_corpus_fileids(FileSystemPathPointer(root), ".*")
    fileids = [
        name for name in candidates if re.search(r"\d\-\d\-[\d]+\-[\d]+", name)
    ]

    def _knbc_fileids_sort(x):
        # Order by the four '-'-separated parts, the last three numerically.
        parts = x.split('-')
        return (parts[0], int(parts[1]), int(parts[2]), int(parts[3]))

    knbc = LazyCorpusLoader(
        'knbc/corpus1',
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding='euc-jp',
    )

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    def _morphs_with_tag(morphs):
        # Show every morph as "surface(third tag cell)", skipping EOS.
        labelled = '/'.join(
            "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        )
        return labelled.encode('utf-8')

    knbc.morphs2str = _morphs_with_tag

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    tagged_lines = []
    for sent in knbc.tagged_sents()[0:2]:
        tagged_lines.append(
            ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        )
    print('\n'.join(tagged_lines))
|
||||
|
||||
|
||||
def test():
    """Check the element types returned by the KNB corpus reader."""

    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp'
    )

    first_word = knbc.words()[0]
    assert isinstance(first_word, string_types)

    first_sent_word = knbc.sents()[0][0]
    assert isinstance(first_sent_word, string_types)

    first_tagged_word = knbc.tagged_words()[0]
    assert isinstance(first_tagged_word, tuple)

    first_tagged_sent_word = knbc.tagged_sents()[0][0]
    assert isinstance(first_tagged_sent_word, tuple)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
demo()
|
||||
184
venv/lib/python3.7/site-packages/nltk/corpus/reader/lin.py
Normal file
184
venv/lib/python3.7/site-packages/nltk/corpus/reader/lin.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# Natural Language Toolkit: Lin's Thesaurus
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Dan Blanchard <dblanchard@ets.org>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.txt
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from functools import reduce
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
class LinThesaurusCorpusReader(CorpusReader):
    """ Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        ''' Factory for creating defaultdict of defaultdict(dict)s '''
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        '''
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        '''

        super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
        # Layout: fileid -> entry key -> synonym ngram -> score
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            with open(path) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
                        first = False
                    # End of entry
                    elif line == '))':
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split('\t')
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def _entry(self, fileid, ngram):
        '''
        Return the synonym -> score mapping of ``ngram`` in ``fileid``.

        Uses ``get`` so that looking up an unknown fileid or ngram does
        not insert an empty entry into the nested defaultdicts (which
        would make ``ngram in thesaurus`` true afterwards).
        '''
        return self._thesaurus.get(fileid, {}).get(ngram, {})

    def similarity(self, ngram1, ngram2, fileid=None):
        '''
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        '''
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            else:
                return [(fid, 1.0) for fid in self._fileids]
        else:
            if fileid:
                return self._entry(fileid, ngram1).get(ngram2, self._badscore)
            else:
                return [
                    (fid, self._entry(fid, ngram1).get(ngram2, self._badscore))
                    for fid in self._fileids
                ]

    def scored_synonyms(self, ngram, fileid=None):
        '''
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, the (synonym, score) items; otherwise,
                 list of tuples of fileids and item collections, whose elements are
                 (synonym, score) tuples.
        '''
        if fileid:
            return self._entry(fileid, ngram).items()
        else:
            return [
                (fileid, self._entry(fileid, ngram).items())
                for fileid in self._fileids
            ]

    def synonyms(self, ngram, fileid=None):
        '''
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, the synonyms; otherwise, list of tuples of fileids and
                 synonym collections.
        '''
        if fileid:
            return self._entry(fileid, ngram).keys()
        else:
            return [
                (fileid, self._entry(fileid, ngram).keys())
                for fileid in self._fileids
            ]

    def __contains__(self, ngram):
        '''
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        '''
        return any(
            ngram in self._thesaurus.get(fileid, {}) for fileid in self._fileids
        )
|
||||
|
||||
|
||||
######################################################################
|
||||
# Demo
|
||||
######################################################################
|
||||
|
||||
|
||||
def demo():
    """
    Print a few example lookups against the Lin thesaurus corpus:
    plain and scored synonyms over all fileids, the same two lookups
    restricted to ``simN.lsp``, and one similarity score.
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"

    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # The original repeated the previous lookup verbatim; show the scored
    # variant for the same fileid instead.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))


if __name__ == '__main__':
    demo()
|
||||
414
venv/lib/python3.7/site-packages/nltk/corpus/reader/mte.py
Normal file
414
venv/lib/python3.7/site-packages/nltk/corpus/reader/mte.py
Normal file
@@ -0,0 +1,414 @@
|
||||
"""
|
||||
A reader for corpora whose documents are in MTE format.
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
from functools import reduce
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader import concat, TaggedCorpusReader
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusView
|
||||
|
||||
|
||||
def xpath(root, path, ns):
    """
    Return ``root.findall(path, ns)``: the elements below ``root`` that
    match ``path``, with ``ns`` forwarded as the namespace mapping.
    """
    matches = root.findall(path, ns)
    return matches
|
||||
|
||||
|
||||
class MTECorpusView(XMLCorpusView):
    """
    Class for lazy viewing the MTE Corpus.

    Behaves like ``XMLCorpusView`` except that ``None`` results produced
    by the element handler are dropped from every block that is read.
    """

    def __init__(self, fileid, tagspec, elt_handler=None):
        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        # Handlers return None for elements that were filtered out.
        block = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
        return [item for item in block if item is not None]
|
||||
|
||||
|
||||
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.

    The ``tagged_*`` methods bind the requested ``tagset``/``tags`` to
    the view they return, so a lazily evaluated view keeps its own
    filter even if another ``tagged_*`` call is made before it is read.
    """

    ns = {
        'tei': 'http://www.tei-c.org/ns/1.0',
        'xml': 'http://www.w3.org/XML/1998/namespace',
    }
    tag_ns = '{http://www.tei-c.org/ns/1.0}'
    xml_ns = '{http://www.w3.org/XML/1998/namespace}'
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    # Fallback filter used by the ``_tagged_*_elt`` handlers when they are
    # called without an explicit tagset/tags. The ``tagged_*`` methods still
    # update these for callers that use the handlers directly.
    __tagset = "msd"
    __tags = ""

    def __init__(self, file_path):
        """
        :param file_path: path of the corpus file this reader works on.
        """
        self.__file_path = file_path

    @classmethod
    def _word_elt(cls, elt, context):
        """Return the text of a word/punctuation element."""
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        """Return the texts of all child elements of a sentence element."""
        return [cls._word_elt(w, None) for w in xpath(elt, '*', cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        """Return one word list per child element of a paragraph element."""
        return [cls._sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context, tagset=None, tags=None):
        """
        Return ``(text, tag)`` for a word element, or None if the element
        is rejected by the ``tags`` filter.

        :param tagset: "msd" returns the ``ana`` attribute unchanged; any
            other value converts it with ``MTETagConverter``. None means
            "use the class-level fallback".
        :param tags: tag prefix the ``ana`` attribute has to match, with
            '-' matching any single character. None means "use the
            class-level fallback"; "" disables filtering.
        """
        if tagset is None:
            tagset = cls.__tagset
        if tags is None:
            tags = cls.__tags

        # Elements without an annotation are always kept, with an empty tag.
        if 'ana' not in elt.attrib:
            return (elt.text, '')
        ana = elt.attrib['ana']

        unfiltered = tags == "" and tagset in ("msd", "universal")
        if not unfiltered:
            # '-' in the requested tag becomes the regex wildcard '.'.
            pattern = re.compile('^' + re.sub("-", ".", tags) + '.*$')
            if not pattern.match(ana):
                return None

        if tagset == "msd":
            return (elt.text, ana)
        return (elt.text, MTETagConverter.msd_to_universal(ana))

    @classmethod
    def _tagged_sent_elt(cls, elt, context, tagset=None, tags=None):
        """Return the tagged words of a sentence element that pass the filter."""
        tagged = [
            cls._tagged_word_elt(w, None, tagset, tags)
            for w in xpath(elt, '*', cls.ns)
        ]
        return [pair for pair in tagged if pair is not None]

    @classmethod
    def _tagged_para_elt(cls, elt, context, tagset=None, tags=None):
        """Return one filtered tagged-word list per child of a paragraph element."""
        return [
            cls._tagged_sent_elt(s, None, tagset, tags)
            for s in xpath(elt, '*', cls.ns)
        ]

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        """Return ``(text, lemma)``; the lemma is '' when the attribute is absent."""
        if 'lemma' not in elt.attrib:
            return (elt.text, '')
        else:
            return (elt.text, elt.attrib['lemma'])

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        """Return the ``(text, lemma)`` pairs of a sentence element's children."""
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, '*', cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        """Return one ``(text, lemma)`` list per child of a paragraph element."""
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]

    @staticmethod
    def _bind_filter(handler, tagset, tags):
        """Wrap a ``_tagged_*_elt`` handler so it always uses the given filter."""

        def bound(elt, context):
            return handler(elt, context, tagset, tags)

        return bound

    def _tagged_view(self, path, handler, tagset, tags):
        """Build a view over ``path`` whose handler is bound to tagset/tags."""
        # Kept for backward compatibility with direct handler calls.
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        return MTECorpusView(
            self.__file_path, path, self._bind_filter(handler, tagset, tags)
        )

    def words(self):
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    def tagged_words(self, tagset, tags):
        return self._tagged_view(
            MTEFileReader.word_path, MTEFileReader._tagged_word_elt, tagset, tags
        )

    def lemma_sents(self):
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        return self._tagged_view(
            MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt, tagset, tags
        )

    def lemma_paras(self):
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        return self._tagged_view(
            MTEFileReader.para_path, MTEFileReader._tagged_para_elt, tagset, tags
        )
|
||||
|
||||
|
||||
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # Maps the category letter of an MSD tag to a universal tag; '-' is the
    # entry used for everything that is not listed.
    mapping_msd_universal = {
        'A': 'ADJ',
        'S': 'ADP',
        'R': 'ADV',
        'C': 'CONJ',
        'D': 'DET',
        'N': 'NOUN',
        'M': 'NUM',
        'Q': 'PRT',
        'P': 'PRON',
        'V': 'VERB',
        '.': '.',
        '-': 'X',
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book

        The category is read from the first character of ``tag``, or from
        the second one when the tag starts with '#'. Unknown tags, as well
        as tags that have no category character at all ('' or '#'), will
        be mapped to X. A '.' category is passed through as '.'.

        :param tag: the MSD tag to convert
        :type tag: str
        :return: the universal tag
        :rtype: str
        """
        mapping = MTETagConverter.mapping_msd_universal
        # Slicing (instead of indexing) yields '' for too-short tags.
        indicator = tag[1:2] if tag[:1] == "#" else tag[:1]
        return mapping.get(indicator, mapping['-'])
|
||||
|
||||
|
||||
class MTECorpusReader(TaggedCorpusReader):
    """
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    """

    def __init__(self, root=None, fileids=None, encoding='utf8'):
        """
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        """
        TaggedCorpusReader.__init__(self, root, fileids, encoding)

    def __fileids(self, fileids):
        """
        Normalise ``fileids`` to a list of usable fileids.

        None selects every fileid of the corpus and a single string is
        treated as a one-element list. Fileids that are not part of the
        corpus, and the two source files that are not compatible with
        the TEI-p5 specification, are dropped. A message is printed when
        nothing is left.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        # multext-east sourcefiles that are not compatible to the teip5 specification
        incompatible = ("oana-bg.xml", "oana-mk.xml")
        # Build a real list: a lazy filter object is always truthy, which
        # made the emptiness check below a no-op.
        fileids = [
            f for f in fileids if f in self._fileids and f not in incompatible
        ]
        if not fileids:
            print("No valid multext-east file specified")
        return fileids

    def _file_readers(self, fileids):
        """Return one ``MTEFileReader`` per usable fileid."""
        return [
            MTEFileReader(os.path.join(self._root, f))
            for f in self.__fileids(fileids)
        ]

    @staticmethod
    def _is_known_tagset(tagset):
        """Print a message and return False unless tagset is "universal" or "msd"."""
        if tagset == "universal" or tagset == "msd":
            return True
        print("Unknown tagset specified.")
        return False

    def readme(self):
        """
        Prints some information about this corpus.
        :return: the content of the attached README file
        :rtype: str
        """
        return self.open("00README.txt").read()

    def raw(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        # The original passed a list as the function argument of reduce(),
        # which raises TypeError; joining is what the contract describes.
        return "".join(self.open(f).read() for f in self.__fileids(fileids))

    def words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat([r.words() for r in self._file_readers(fileids)])

    def sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        """
        return concat([r.sents() for r in self._file_readers(fileids)])

    def paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        """
        return concat([r.paras() for r in self._file_readers(fileids)])

    def lemma_words(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        """
        return concat([r.lemma_words() for r in self._file_readers(fileids)])

    def tagged_words(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag); None (after printing a
                 message) if the tagset is unknown
        :rtype: list(tuple(str, str))
        """
        if not self._is_known_tagset(tagset):
            return None
        return concat(
            [r.tagged_words(tagset, tags) for r in self._file_readers(fileids)]
        )

    def lemma_sents(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        """
        return concat([r.lemma_sents() for r in self._file_readers(fileids)])

    def tagged_sents(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of (word,tag) tuples; None (after
                 printing a message) if the tagset is unknown
        :rtype: list(list(tuple(str, str)))
        """
        if not self._is_known_tagset(tagset):
            return None
        return concat(
            [r.tagged_sents(tagset, tags) for r in self._file_readers(fileids)]
        )

    def lemma_paras(self, fileids=None):
        """
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(list(list(tuple(str, str))))
        """
        return concat([r.lemma_paras() for r in self._file_readers(fileids)])

    def tagged_paras(self, fileids=None, tagset="msd", tags=""):
        """
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples; None (after printing a message) if
                 the tagset is unknown
        :rtype: list(list(list(tuple(str, str))))
        """
        if not self._is_known_tagset(tagset):
            return None
        return concat(
            [r.tagged_paras(tagset, tags) for r in self._file_readers(fileids)]
        )
|
||||
489
venv/lib/python3.7/site-packages/nltk/corpus/reader/nkjp.py
Normal file
489
venv/lib/python3.7/site-packages/nltk/corpus/reader/nkjp.py
Normal file
@@ -0,0 +1,489 @@
|
||||
# Natural Language Toolkit: NKJP Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Gabriela Kaczka
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import functools
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader.util import concat
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
||||
|
||||
|
||||
def _parse_args(fun):
|
||||
"""
|
||||
Wraps function arguments:
|
||||
if fileids not specified then function set NKJPCorpusReader paths.
|
||||
"""
|
||||
|
||||
@functools.wraps(fun)
|
||||
def decorator(self, fileids=None, **kwargs):
|
||||
if not fileids:
|
||||
fileids = self._paths
|
||||
return fun(self, fileids, **kwargs)
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
class NKJPCorpusReader(XMLCorpusReader):
    # Selectors understood by ``_view``.
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3

    def __init__(self, root, fileids='.*'):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp'])  #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])

        :param root: the root directory of the corpus.
        :param fileids: a regexp prefix (string) or a list of names; the
            reader is indexed by the ``header.xml`` file below each of them.
        """
        if isinstance(fileids, string_types):
            XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml')
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + '/header.xml' for fileid in fileids]
            )
        self._paths = self.get_paths()

    def get_paths(self):
        """
        Returns, for every fileid, the root joined with the part of the
        fileid that precedes ``header.xml``.
        """
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]

    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]

    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.

        :param filename: location handed to the selected view class.
        :param tags: forwarded to the view as ``tags``.
        :param mode: (keyword) one of the ``*_MODE`` constants; defaults
            to ``WORDS_MODE``.
        :raise NameError: if ``mode`` is not one of the known constants.
        """
        mode = kwargs.pop('mode', NKJPCorpusReader.WORDS_MODE)
        # Compare by value: identity of equal ints is an implementation detail.
        if mode == NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        elif mode == NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        elif mode == NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        elif mode == NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        else:
            raise NameError('No such mode!')

    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.

        A fileid that already contains the root is returned unchanged;
        otherwise it is joined onto the root, inserting a path separator
        when the root does not end with one.
        """
        if self.root in fileid:
            return fileid
        return os.path.join(self.root, fileid)

    def _query(self, fileids, mode, **kwargs):
        """Run ``handle_query`` on a ``mode`` view of every fileid and concatenate the results."""
        return concat(
            [
                self._view(self.add_root(fileid), mode=mode, **kwargs).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.HEADER_MODE, **kwargs)

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.SENTS_MODE, **kwargs)

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.WORDS_MODE, **kwargs)

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop('tags', [])
        return self._query(
            fileids, NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs
        )

    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns the raw text of specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.RAW_MODE, **kwargs)
|
||||
|
||||
|
||||
class NKJPCorpus_Header_View(XMLCorpusView):

    # Children of ``bibl`` that are reported, in output order.
    _BIBL_FIELDS = ('title', 'author', 'date', 'publisher', 'idno', 'note')

    def __init__(self, filename, **kwargs):
        """
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.

        :param filename: directory that contains ``header.xml``; a
            trailing path separator is optional.
        """
        self.tagspec = ".*/sourceDesc$"
        # os.path.join keeps the result identical when ``filename`` ends
        # with a separator and inserts one when it does not.
        XMLCorpusView.__init__(
            self, os.path.join(filename, 'header.xml'), self.tagspec
        )

    def handle_query(self):
        """
        Read the whole file and return the list of header dictionaries
        produced by ``handle_elt``. The stream is closed even if reading
        fails.
        """
        self._open()
        header = []
        try:
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                header.extend(segm)
        finally:
            self.close()
        return header

    @staticmethod
    def _joined_text(elt, field):
        """
        Return the stripped texts of all ``bibl/<field>`` elements joined
        by newlines, or an empty list when there is no such element.
        """
        found = elt.findall('bibl/' + field)
        if not found:
            # Historical default: callers receive [] rather than ''.
            return []
        # ``text`` is None for empty elements; treat that as ''.
        return '\n'.join((node.text or '').strip() for node in found)

    def handle_elt(self, elt, context):
        """
        Build a dictionary with the keys title, author, date, publisher,
        idno and note from the ``bibl`` children of ``elt``.
        """
        return dict(
            (field, self._joined_text(elt, field)) for field in self._BIBL_FIELDS
        )
|
||||
|
||||
|
||||
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level

    The file is processed as bytes, so its encoding is carried over
    untouched into the preprocessed copy.
    """

    # ``nkjp:`` followed by any run of non-space characters and one space
    # (in all files).
    _NKJP_TOKEN = re.compile(br'nkjp:[^ ]* ')
    # Wrapper tags found in ann_segmentation.xml.
    _WRAPPER_TAGS = re.compile(br'</?nkjp:paren>|</?choice>')

    def __init__(self, root, filename):
        """
        :param root: directory containing the file to preprocess.
        :param filename: name of the file inside ``root``.
        """
        self.read_file = os.path.join(root, filename)
        # Binary temporary file that outlives close(); it is deleted by
        # remove_preprocessed_file().
        self.write_file = tempfile.NamedTemporaryFile(delete=False)

    def build_preprocessed_file(self):
        """
        Copy ``read_file`` into the temporary file, replacing every
        ``nkjp:`` token and every ``nkjp:paren``/``choice`` tag by a
        single space.

        :return: the name of the temporary file.
        :raise Exception: whatever error occurred while reading or
            writing; the temporary file is removed before re-raising.
        """
        fw = self.write_file
        try:
            with open(self.read_file, 'rb') as fr:
                for line in fr:
                    line = self._NKJP_TOKEN.sub(b' ', line)
                    line = self._WRAPPER_TAGS.sub(b' ', line)
                    fw.write(line)
            fw.close()
        except Exception:
            self.remove_preprocessed_file()
            # Re-raise the original error instead of a bare Exception.
            raise
        return self.write_file.name

    def remove_preprocessed_file(self):
        """
        Delete the temporary file. Safe to call more than once.
        """
        # Close first: an open handle blocks removal on some platforms.
        self.write_file.close()
        if os.path.exists(self.write_file.name):
            os.remove(self.write_file.name)
|
||||
|
||||
|
||||
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        """
        :param filename: directory holding ``text.xml`` and
            ``ann_segmentation.xml``.
        """
        self.tagspec = '.*p/.*s'
        # intersperse NKJPCorpus_Text_View
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        # Reading the text view fills text_view.segm_dict, which
        # get_sentences() looks segments up in.
        self.text_view.handle_query()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def get_segm_id(self, example_word):
        """Return what lies between the first '(' and the following ','."""
        return example_word.split('(')[1].split(',')[0]

    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(',')[1])

    def get_sent_end(self, end_word):
        # returns index of end letter in sentence
        splitted = end_word.split(')')[0].split(',')
        return int(splitted[1]) + int(splitted[2])

    def get_sentences(self, sent_segm):
        # returns one sentence
        segm_id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[segm_id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[-1])
        return segm[beg:end]

    def remove_choice(self, segm):
        """
        Return the words of ``segm`` that do not start before the end of
        the previously kept word of the same text segment.
        """
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
            prev_txt_nr = txt_nr

        return ret

    def handle_query(self):
        """
        Read the whole preprocessed file and return the list of sentence
        strings. The stream is closed and the temporary file removed
        whether or not reading succeeds; errors propagate unchanged.
        """
        try:
            self._open()
            sentences = []
            while True:
                sent_segm = XMLCorpusView.read_block(self, self._stream)
                if len(sent_segm) == 0:
                    break
                for segm in sent_segm:
                    segm = self.remove_choice(segm)
                    sentences.append(self.get_sentences(segm))
            return sentences
        finally:
            self.close()
            self.xml_tool.remove_preprocessed_file()

    def handle_elt(self, elt, context):
        """Return the ``corresp`` attribute of every child of ``elt``."""
        return [seg.get('corresp') for seg in elt]
|
||||
|
||||
|
||||
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    """

    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        """
        :param filename: directory holding ``text.xml``.
        :param mode: (keyword) ``SENTS_MODE`` (default) or ``RAW_MODE``.
        """
        self.mode = kwargs.pop('mode', 0)
        self.tagspec = '.*/div/ab'
        # Filled by handle_elt() in SENTS_MODE: segment id -> segment text.
        self.segm_dict = dict()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, 'text.xml')
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """
        Read the whole preprocessed file and return the result of
        ``read_block``. The stream is closed and the temporary file
        removed whether or not reading succeeds; errors propagate
        unchanged.
        """
        try:
            self._open()
            return self.read_block(self._stream)
        finally:
            self.close()
            self.xml_tool.remove_preprocessed_file()

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a one-element list holding all segment texts of
        the stream joined by single spaces.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            txt.extend(segm)

        return [' '.join(txt)]

    def get_segm_id(self, elt):
        """Return the value of the first attribute whose name ends with 'id' (None if there is none)."""
        for attr in elt.attrib:
            if attr.endswith('id'):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        if self.mode == NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
|
||||
|
||||
|
||||
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        """
        :param filename: directory holding ``ann_morphosyntax.xml``.
        :param tags: (keyword) collection of ctag values to keep, or
            None to keep every word.
        """
        self.tags = kwargs.pop('tags', None)
        self.tagspec = '.*/seg/fs'
        self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """
        Read the whole preprocessed file and return the list of words
        accepted by ``handle_elt``. The stream is closed and the
        temporary file removed whether or not reading succeeds; errors
        propagate unchanged.
        """
        try:
            self._open()
            words = []
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                # handle_elt() yields None for rejected segments.
                words.extend(part for part in segm if part is not None)
            return words
        finally:
            self.close()
            self.xml_tool.remove_preprocessed_file()

    @staticmethod
    def _ctag_values(interps):
        """
        Return, in document order, the ``value`` attributes found below
        the ``ctag`` features of the ``lex`` children of ``interps``.
        """
        values = []
        for lex in interps:
            if lex.get('type') != 'lex':
                continue
            for feature in lex:
                if feature.get('name') != 'ctag':
                    continue
                for symbol in feature:
                    if 'value' in symbol.keys():
                        values.append(symbol.attrib['value'])
        return values

    def handle_elt(self, elt, context):
        """
        Return the ``orth`` string of the segment, or None if it is
        rejected.

        A segment is accepted when no tags were requested or one of its
        ctag values is among the requested tags, and it has no ctag
        value ``interp`` (other than one that was itself requested).
        """
        word = ''
        # if tags not specified, then always return word
        wanted = self.tags is None
        is_interp = False

        for child in elt:
            name = child.get('name')
            if name == 'orth':
                # get word
                for symbol in child:
                    if symbol.tag == 'string':
                        word = symbol.text
            elif name == 'interps':
                for value in self._ctag_values(child):
                    if self.tags is not None and value in self.tags:
                        wanted = True
                    elif value == 'interp':
                        is_interp = True

        if wanted and not is_interp:
            return word
        return None
|
||||
485
venv/lib/python3.7/site-packages/nltk/corpus/reader/nombank.py
Normal file
485
venv/lib/python3.7/site-packages/nltk/corpus/reader/nombank.py
Normal file
@@ -0,0 +1,485 @@
|
||||
# Natural Language Toolkit: NomBank Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from xml.etree import ElementTree
|
||||
from functools import total_ordering
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.tree import Tree
|
||||
from nltk.internals import raise_unorderable_types
|
||||
from nltk.compat import python_2_unicode_compatible
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles='',
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding='utf8',
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param nounsfile: The name of the file read by ``nouns()``
            (relative to ``root``).
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by nombank.
        :param encoding: The encoding handed to ``CorpusReader.__init__``.
        """
        # A regexp is passed through unchanged so that the base class
        # expands it; any other iterable is materialised exactly once so
        # that one-shot iterables are not exhausted before the base class
        # sees them.  ``self._fileids`` is set by ``CorpusReader.__init__``.
        if not isinstance(framefiles, string_types):
            framefiles = list(framefiles)

        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)

        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        :param fileids: A single fileid, a list of fileids, or None for
            every fileid of this reader.
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close each stream as soon as it has been read instead of
            # leaving it to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def instances(self, baseform=None):
        """
        :param baseform: If given, only instances whose ``baseform``
            equals this value are included.
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def _parse_framefile(self, framefile):
        """Return the root XML element of ``framefile``, closing the stream."""
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        stream = self.abspath(framefile).open()
        try:
            return ElementTree.parse(stream).getroot()
        finally:
            stream.close()

    def roleset(self, roleset_id):
        """
        :param roleset_id: A roleset identifier of the form
            ``baseform.sensenumber``.
        :return: the xml description for the given roleset.
        :raises ValueError: if the frameset file is not among this
            reader's fileids, or if it does not contain the roleset.
        """
        # Undo the escaping applied by ``NombankInstance.roleset`` and map
        # the result onto the name used for the frameset file.
        baseform = roleset_id.split('.')[0]
        baseform = baseform.replace('perc-sign', '%')
        baseform = baseform.replace('oneslashonezero', '1/10').replace(
            '1/10', '1-slash-10'
        )
        framefile = 'frames/%s.xml' % baseform
        if framefile not in self.fileids():
            raise ValueError('Frameset file for %s not found' % roleset_id)

        etree = self._parse_framefile(framefile)
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))

    def rolesets(self, baseform=None):
        """
        :param baseform: If given, only the rolesets of
            ``frames/<baseform>.xml`` are returned; otherwise the rolesets
            of every fileid of this reader.
        :return: list of xml descriptions for rolesets.
        :raises ValueError: if ``baseform`` is given and its frameset file
            is not among this reader's fileids.
        """
        if baseform is not None:
            framefile = 'frames/%s.xml' % baseform
            if framefile not in self.fileids():
                raise ValueError('Frameset file for %s not found' % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()

        rsets = []
        for framefile in framefiles:
            etree = self._parse_framefile(framefile)
            rsets.append(etree.findall('predicate/roleset'))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and return the parsed
        ``NombankInstance`` objects accepted by ``instance_filter``.
        Blank lines (including reads past end of file) are skipped.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Nombank Instance & related datatypes
|
||||
######################################################################
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class NombankInstance(object):
    """
    One annotated noun instance: where its predicate is located and the
    locations and labels of the predicate's arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):

        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A ``NombankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A tuple of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This tuple does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
        look up information about the roleset."""
        r = self.baseform.replace('%', 'perc-sign')
        r = r.replace('1/10', '1-slash-10').replace('1-slash-10', 'oneslashonezero')
        return '%s.%s' % (r, self.sensenumber)

    def __repr__(self):
        return '<NombankInstance: %s, sent %s, word %s>' % (
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        s = '%s %s %s %s %s' % (
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        # The predicate is listed among the arguments under the id 'rel'.
        items = self.arguments + ((self.predicate, 'rel'),)
        for (argloc, argid) in sorted(items):
            s += ' %s-%s' % (argloc, argid)
        return s

    def _get_tree(self):
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``NombankInstance`` from one whitespace-separated line:
        ``fileid sentnum wordnum baseform sensenumber`` followed by one
        or more ``location-identifier`` entries, exactly one of which
        must contain ``-rel`` (the predicate).

        :param s: The annotation line.
        :param parse_fileid_xform: Optional function applied to the
            fileid read from the line.
        :param parse_corpus: Stored on the resulting instance.
        :raises ValueError: if the line has fewer than six fields or does
            not contain exactly one ``-rel`` entry.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError('Badly formatted nombank line: %r' % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]

        # Separate the predicate entry from the arguments in one pass.
        # (Popping from the list while enumerating it skipped the entry
        # that followed each popped one, so a second adjacent "-rel"
        # entry went undetected.)
        rel = []
        args = []
        for piece in pieces[5:]:
            if '-rel' in piece:
                rel.append(piece)
            else:
                args.append(piece)
        if len(rel) != 1:
            raise ValueError('Badly formatted nombank line: %r' % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split('-', 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split('-', 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
|
||||
|
||||
|
||||
class NombankPointer(object):
    """
    Abstract base class for the pointers nombank uses to identify one
    or more constituents in a parse tree.  It has three concrete
    subclasses:

      - ``NombankTreePointer`` points to a single constituent.
      - ``NombankSplitTreePointer`` points to a 'split' constituent,
        i.e. a sequence of two or more ``NombankTreePointer`` pointers.
      - ``NombankChainTreePointer`` points to an entire trace chain in
        a tree; its pieces can be ``NombankTreePointer`` or
        ``NombankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Only the abstract base itself refuses to be instantiated.
        is_abstract_base = self.__class__ == NombankPointer
        if is_abstract_base:
            raise NotImplementedError()
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class NombankChainTreePointer(NombankPointer):
    """A pointer made of several pieces, written joined by ``*``."""

    def __init__(self, pieces):
        self.pieces = pieces
        """The pieces that make up this chain, in order.  Each one is
        either a ``NombankSplitTreePointer`` or a
        ``NombankTreePointer``."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return '*'.join(rendered)

    def __repr__(self):
        return '<NombankChainTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are the subtrees each
        piece selects from ``tree``.

        :raises ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        children = [piece.select(tree) for piece in self.pieces]
        return Tree('*CHAIN*', children)
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class NombankSplitTreePointer(NombankPointer):
    """A pointer made of several pieces, written joined by ``,``."""

    def __init__(self, pieces):
        self.pieces = pieces
        """The pieces that make up this split pointer, in order.  All of
        them are ``NombankTreePointer`` pointers."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return ','.join(rendered)

    def __repr__(self):
        return '<NombankSplitTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are the subtrees each
        piece selects from ``tree``.

        :raises ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        children = [piece.select(tree) for piece in self.pieces]
        return Tree('*SPLIT*', children)
|
||||
|
||||
|
||||
@total_ordering
@python_2_unicode_compatible
class NombankTreePointer(NombankPointer):
    """
    A pointer to a single constituent, written ``wordnum:height``.

    ``parse()`` also accepts the composite notations
    ``wordnum:height*wordnum:height*...`` (chain) and
    ``wordnum:height,wordnum:height,...`` (split), for which it
    returns the corresponding composite pointer object.
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string.  ``*`` is split on first, then ``,``, so
        the pieces of a chain may themselves be split pointers.

        :raises ValueError: if a simple pointer is not of the form
            ``a:b``.
        """
        # Composite forms: chains (xx*yy*zz) first, then split args (xx,yy,zz).
        composite_forms = (
            ('*', NombankChainTreePointer),
            (',', NombankSplitTreePointer),
        )
        for separator, pointer_class in composite_forms:
            parts = s.split(separator)
            if len(parts) > 1:
                return pointer_class(
                    [NombankTreePointer.parse(part) for part in parts]
                )

        # Normal pointers.
        fields = s.split(':')
        if len(fields) != 2:
            raise ValueError('bad nombank pointer %r' % s)
        wordnum, height = fields
        return NombankTreePointer(int(wordnum), int(height))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)

    def __eq__(self, other):
        # A composite pointer is compared through its first simple piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if isinstance(other, NombankTreePointer):
            return self.wordnum == other.wordnum and self.height == other.height
        return self is other

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # A composite pointer is compared through its first simple piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if isinstance(other, NombankTreePointer):
            # Earlier words first; for the same word, greater height first.
            return (self.wordnum, -self.height) < (other.wordnum, -other.height)
        return id(self) < id(other)

    def select(self, tree):
        """
        Return the subtree of ``tree`` that this pointer refers to.

        :raises ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :raises ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')

        # Depth-first walk: ``nodes`` is the path of nodes from the root
        # to the current node, ``path`` the child indices taken so far.
        nodes = [tree]
        path = []
        leaves_seen = 0

        while True:
            current = nodes[-1]

            # word node:
            if not isinstance(current, Tree):
                if leaves_seen == self.wordnum:
                    # Drop the leaf index plus ``height`` further levels.
                    return tuple(path[: len(path) - self.height - 1])
                leaves_seen += 1
                nodes.pop()
                continue

            # tree node: select the next child.
            if len(path) < len(nodes):
                path.append(0)
            else:
                path[-1] += 1

            # Update the stack.
            child_index = path[-1]
            if child_index < len(current):
                nodes.append(current[child_index])
            else:
                # End of node's child list: pop up a level.
                nodes.pop()
                path.pop()
|
||||
@@ -0,0 +1,92 @@
|
||||
# Natural Language Toolkit: NPS Chat Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
import textwrap
|
||||
|
||||
from nltk.util import LazyConcatenation
|
||||
from nltk.internals import ElementWrapper
|
||||
from nltk.tag import map_tag
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
|
||||
|
||||
class NPSChatCorpusReader(XMLCorpusReader):
    """
    XML corpus reader whose views are built over the
    ``Session/Posts/Post`` elements of each file.
    """

    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        # Tagset of the tags stored in the files; used as the source
        # tagset when a different one is requested.
        self._tagset = tagset

    def xml_posts(self, fileids=None):
        """
        Return the ``Post`` elements of the given fileids, each wrapped
        in an ``ElementWrapper`` when the reader was built with
        ``wrap_etree``.
        """
        paths = self.abspaths(fileids)
        if self._wrap_etree:
            views = [
                XMLCorpusView(path, 'Session/Posts/Post', self._wrap_elt)
                for path in paths
            ]
        else:
            views = [XMLCorpusView(path, 'Session/Posts/Post') for path in paths]
        return concat(views)

    def posts(self, fileids=None):
        """Return the posts of the given fileids, each as a list of words."""
        views = []
        for path in self.abspaths(fileids):
            views.append(
                XMLCorpusView(
                    path, 'Session/Posts/Post/terminals', self._elt_to_words
                )
            )
        return concat(views)

    def tagged_posts(self, fileids=None, tagset=None):
        """
        Return the posts of the given fileids, each as a list of
        ``(word, tag)`` tuples, with tags mapped to ``tagset`` if given.
        """
        handler_for_tagset = lambda elt, handler: self._elt_to_tagged_words(
            elt, handler, tagset
        )
        views = []
        for path in self.abspaths(fileids):
            views.append(
                XMLCorpusView(path, 'Session/Posts/Post/terminals', handler_for_tagset)
            )
        return concat(views)

    def words(self, fileids=None):
        """Return the words of all posts as one lazily concatenated sequence."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return the tagged words of all posts as one lazy sequence."""
        return LazyConcatenation(self.tagged_posts(fileids, tagset))

    def _wrap_elt(self, elt, handler):
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        words = []
        for terminal in elt.findall('t'):
            words.append(self._simplify_username(terminal.attrib['word']))
        return words

    def _elt_to_tagged_words(self, elt, handler, tagset=None):
        pairs = []
        for terminal in elt.findall('t'):
            word = self._simplify_username(terminal.attrib['word'])
            pairs.append((word, terminal.attrib['pos']))

        # Only map tags when a different tagset was asked for.
        if tagset and tagset != self._tagset:
            return [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in pairs
            ]
        return pairs

    @staticmethod
    def _simplify_username(word):
        """Shorten a word containing ``User`` to ``U`` plus what follows it."""
        if 'User' in word:
            suffix = word.split('User', 1)[1]
            return 'U' + suffix
        if isinstance(word, bytes):
            return word.decode('ascii')
        return word
|
||||
@@ -0,0 +1,123 @@
|
||||
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Opinion Lexicon.
|
||||
|
||||
- Opinion Lexicon information -
|
||||
Authors: Minqing Hu and Bing Liu, 2004.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
http://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
||||
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
|
||||
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
|
||||
|
||||
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
|
||||
Comparing Opinions on the Web". Proceedings of the 14th International World
|
||||
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
|
||||
"""
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader import WordListCorpusReader
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """
    A corpus view that starts reading after the initial readme block of
    the corpus file.
    """

    def __init__(self, *args, **kwargs):
        super(IgnoreReadmeCorpusView, self).__init__(*args, **kwargs)
        # The stream must be open before the readme block can be consumed.
        self._open()
        # Consume (and discard) the readme block.
        read_blankline_block(self._stream)
        # Whatever follows the readme becomes the view's first position.
        start = self._stream.tell()
        self._filepos = [start]
|
||||
|
||||
|
||||
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon.  Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10]
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10]
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if isinstance(fileids, string_types):
            fileids = [fileids]
        elif fileids is None:
            fileids = self._fileids

        views = []
        for path, enc, _fileid in self.abspaths(fileids, True, True):
            views.append(self.CorpusView(path, self._read_word_block, encoding=enc))
        return concat(views)

    def positive(self):
        """
        Return all positive words in alphabetical order.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words('positive-words.txt')

    def negative(self):
        """
        Return all negative words in alphabetical order.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words('negative-words.txt')

    def _read_word_block(self, stream):
        """
        Read 20 lines from ``stream`` and return them stripped; empty
        reads (end of file) contribute nothing.
        """
        lines = [stream.readline() for _ in range(20)]  # Read 20 lines at a time.
        return [line.strip() for line in lines if line]
|
||||
@@ -0,0 +1,174 @@
|
||||
# Natural Language Toolkit: PanLex Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: David Kamholz <kamholz@panlex.org>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
|
||||
as an SQLite database. See the README.txt in the panlex_lite corpus directory
|
||||
for more information on PanLex Lite.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
|
||||
|
||||
class PanLexLiteCorpusReader(CorpusReader):
    """
    Reader that answers queries from the ``db.sqlite`` database found
    in the corpus root.
    """

    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        db_path = os.path.join(root, 'db.sqlite')
        self._c = sqlite3.connect(db_path).cursor()

        # Two-way mapping between uniform identifiers and ``lv`` values.
        self._uid_lv = {}
        self._lv_uid = {}

        for uid, lv in self._c.execute('SELECT uid, lv FROM lv'):
            self._uid_lv[uid] = lv
            self._lv_uid[lv] = uid

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """
        if lc is not None:
            cursor = self._c.execute(
                'SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)
            )
        else:
            cursor = self._c.execute('SELECT uid, tt FROM lv ORDER BY uid')
        return cursor.fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """
        expr_lv = self._uid_lv[expr_uid]

        mn_info = {}

        rows = self._c.execute(self.MEANING_Q, (expr_tt, expr_lv))
        for mn, uq, ap, ui, tt, lv in rows:
            uid = self._lv_uid[lv]

            # First row of a meaning: record its attributes and seed the
            # expressions with the queried expression itself.
            if mn not in mn_info:
                mn_info[mn] = {
                    'uq': uq,
                    'ap': ap,
                    'ui': ui,
                    'ex': {expr_uid: [expr_tt]},
                }

            mn_info[mn]['ex'].setdefault(uid, []).append(tt)

        return [Meaning(mn, info) for mn, info in mn_info.items()]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """
        query_args = (self._uid_lv[from_uid], from_tt, self._uid_lv[to_uid])
        return self._c.execute(self.TRANSLATION_Q, query_args).fetchall()
|
||||
|
||||
|
||||
class Meaning(dict):
    """
    Represents a single PanLex meaning.  A meaning is a translation set derived
    from a single source.
    """

    def __init__(self, mn, attr):
        dict.__init__(self, **attr)
        # The id is stored last, so it wins over any 'mn' key in ``attr``.
        self['mn'] = mn

    def id(self):
        """
        :return: the meaning's id.
        :rtype: int
        """
        meaning_id = self['mn']
        return meaning_id

    def quality(self):
        """
        :return: the meaning's source's quality (0=worst, 9=best).
        :rtype: int
        """
        source_quality = self['uq']
        return source_quality

    def source(self):
        """
        :return: the meaning's source id.
        :rtype: int
        """
        source_id = self['ap']
        return source_id

    def source_group(self):
        """
        :return: the meaning's source group id.
        :rtype: int
        """
        group_id = self['ui']
        return group_id

    def expressions(self):
        """
        :return: the meaning's expressions as a dictionary whose keys are language
            variety uniform identifiers and whose values are lists of expression
            texts.
        :rtype: dict
        """
        by_variety = self['ex']
        return by_variety
|
||||
@@ -0,0 +1,94 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: Word List Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
from __future__ import print_function
|
||||
from collections import namedtuple, defaultdict
|
||||
import re
|
||||
from six import string_types
|
||||
|
||||
|
||||
from nltk.tokenize import line_tokenize
|
||||
|
||||
from nltk.corpus.reader.wordlist import WordListCorpusReader
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
# One record per line of the languages file; fields are in column order.
PanlexLanguage = namedtuple(
    'PanlexLanguage',
    (
        'panlex_uid',  # (1) PanLex UID
        'iso639',  # (2) ISO 639 language code
        'iso639_type',  # (3) ISO 639 language type, see README
        'script',  # (4) normal scripts of expressions
        'name',  # (5) PanLex default name
        'langvar_uid',  # (6) UID of the language variety in which the default name is an expression
    ),
)
|
||||
|
||||
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.
        first_fileid = self.fileids()[0]
        size_match = re.match(r'swadesh([0-9].*)\/', first_fileid)
        self.swadesh_size = size_match.group(1)
        self._languages = dict(
            (lang.panlex_uid, lang) for lang in self.get_languages()
        )
        self._macro_langauges = self.get_macrolanguages()

    def license(self):
        """Print the name of the license."""
        print('CC0 1.0 Universal')

    def readme(self):
        """Print the raw contents of the ``README`` file."""
        print(self.raw('README'))

    def language_codes(self):
        """Return the PanLex UIDs of the languages that were loaded."""
        return self._languages.keys()

    def get_languages(self):
        """
        Yield one ``PanlexLanguage`` per non-empty line of
        ``langs<size>.txt``, splitting each line on tabs.
        """
        listing = self.raw('langs{}.txt'.format(self.swadesh_size))
        for line in listing.split('\n'):
            record = line.strip()
            if record:  # Skip empty lines.
                yield PanlexLanguage(*record.split('\t'))

    def get_macrolanguages(self):
        """Return a mapping from ISO 639 code to the PanLex UIDs sharing it."""
        uids_by_iso639 = defaultdict(list)
        for lang in self._languages.values():
            uids_by_iso639[lang.iso639].append(lang.panlex_uid)
        return uids_by_iso639

    def words_by_lang(self, lang_code):
        """
        :return: a list of list(str)
        """
        fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
        concepts = []
        for concept in self.words(fileid):
            concepts.append(concept.split('\t'))
        return concepts

    def words_by_iso639(self, iso63_code):
        """
        :return: a list of list(str)
        """
        fileids = []
        for lang_code in self._macro_langauges[iso63_code]:
            fileids.append('swadesh{}/{}.txt'.format(self.swadesh_size, lang_code))

        concepts = []
        for fileid in fileids:
            concepts.extend(concept.split('\t') for concept in self.words(fileid))
        return concepts

    def entries(self, fileids=None):
        """
        :return: a tuple of words for the specified fileids.
        """
        if not fileids:
            fileids = self.fileids()

        # Transpose: the i-th entry holds the i-th word of every file.
        columns = [self.words(fileid) for fileid in fileids]
        return list(zip(*columns))
|
||||
383
venv/lib/python3.7/site-packages/nltk/corpus/reader/pl196x.py
Normal file
383
venv/lib/python3.7/site-packages/nltk/corpus/reader/pl196x.py
Normal file
@@ -0,0 +1,383 @@
|
||||
# Natural Language Toolkit:
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader
|
||||
|
||||
|
||||
# Contents of <p ...>...</p> and <s ...>...</s> elements.
PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')

# <w>/<c> elements: TAGGEDWORD also captures the opening tag's text
# (from the element name up to and including '>'), WORD only the contents.
TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')

# Attribute values inside an opening tag.
TYPE = re.compile(r'type="(.*?)"')
ANA = re.compile(r'ana="(.*?)"')

# Value of the id attribute of a <text> element.
TEXTID = re.compile(r'text id="(.*?)"')
|
||||
|
||||
|
||||
class TEICorpusView(StreamBackedCorpusView):
    """
    Corpus view that extracts words, sentences or paragraphs from the
    ``<p>``/``<s>``/``<w>``/``<c>`` markup of a stream, optionally
    restricted to selected ``<text id="...">`` elements.
    """

    # Size hint given to ``stream.readlines`` for the first read of a block.
    _pagesize = 4096

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,
        head_len=0,
        textids=None,
    ):
        """
        :param corpus_file: passed on to ``StreamBackedCorpusView``.
        :param tagged: if true, tokens are ``(word, tag)`` tuples;
            otherwise they are plain word strings.
        :param group_by_sent: if true, the tokens of each sentence are
            kept together in their own list.
        :param group_by_para: if true, the content of each paragraph is
            kept together in its own list.
        :param tagset: accepted but not used by this class.
        :param head_len: start position passed to the base view.
        :param textids: when non-empty, ``<text>`` elements whose id is
            not in this collection are cut out of every block.
        """
        self._textids = textids
        self._tagged = tagged
        self._group_by_para = group_by_para
        self._group_by_sent = group_by_sent
        # WARNING -- skip header
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    def read_block(self, stream):
        """
        Read one block from ``stream`` and return the tokens it holds,
        grouped according to the flags given to the constructor.
        """
        text = concat(stream.readlines(self._pagesize))

        # Extend the block line by line until it contains at least one
        # '<text id' and no more of them than '</text>' closings, or the
        # stream is exhausted.
        while True:
            opened = text.count('<text id')
            if opened != 0 and opened <= text.count('</text>'):
                break
            extra = stream.readline()
            if len(extra) <= 0:
                break
            text += extra

        text = text.replace('\n', '')

        if self._textids:
            for tid in TEXTID.findall(text):
                if tid in self._textids:
                    continue
                # Cut from the character just before the id through the
                # next '</text>'.
                start = text.find(tid) - 1
                length = text[start:].find('</text>') + len('</text>')
                text = text[:start] + text[start + length :]

        result = []
        for para_markup in PARA.findall(text):
            units = []
            for sent_markup in SENT.findall(para_markup):
                if self._tagged:
                    tokens = [
                        self._parse_tag(pair)
                        for pair in TAGGEDWORD.findall(sent_markup)
                    ]
                else:
                    tokens = WORD.findall(sent_markup)
                if self._group_by_sent:
                    units.append(tokens)
                else:
                    units.extend(tokens)
            if self._group_by_para:
                result.append(units)
            else:
                result.extend(units)
        return result

    def _parse_tag(self, tag_word_tuple):
        """
        Turn a ``(opening tag, word)`` match into ``(word, tag)``.

        The tag is the ``ana`` attribute for tags starting with ``w``
        and the ``type`` attribute otherwise.
        """
        markup, word = tag_word_tuple
        attribute = ANA if markup.startswith('w') else TYPE
        return word, attribute.search(markup).group(1)
|
||||
|
||||
|
||||
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Categorized XML corpus reader whose word, sentence and paragraph
    views are produced by ``TEICorpusView``.  Besides fileids and
    categories, documents can be selected by text id when a
    ``textid_file`` mapping is supplied to the constructor.
    """

    # Start position handed to every TEICorpusView.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        """
        :param args: positional arguments for ``XMLCorpusReader``.
        :param kwargs: categorization arguments for
            ``CategorizedCorpusReader``, plus the optional
            ``textid_file``: path of a file whose lines hold a fileid,
            a space, and the text ids stored in that file (separated
            by ``self._delimiter``).
        """
        self._textids = kwargs.get('textid_file')

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        """
        Build the fileid -> text ids and text id -> fileids mappings
        from the ``textid_file`` (both stay empty when none was given).

        :raise ValueError: if the file names a fileid that is not part
            of this corpus.
        """
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is None:
            return

        # Looked up once instead of once per line.
        known_fileids = set(self.fileids())
        with open(self._textids) as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    # Tolerate blank lines in the mapping file.
                    continue
                file_id, text_ids = line.split(' ', 1)
                if file_id not in known_fileids:
                    raise ValueError(
                        'In text_id mapping file %s: %s not found'
                        % (self._textids, file_id)
                    )
                for text_id in text_ids.split(self._delimiter):
                    self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        """Record that ``text_id`` is stored in ``file_id``."""
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Translate a selection into ``(fileids, textids_by_file)``.

        At most one of the three selectors may be given:

        - ``fileids``: returned unchanged, with no text id filter.
        - ``categories``: replaced by the fileids of those categories.
        - ``textids`` (a string or an iterable of strings): replaced by
          the files containing them, plus a dict mapping each of those
          files to the requested text ids it contains.

        When no selector is given, ``(None, None)`` is returned and the
        caller decides what "everything" means.

        :raise ValueError: if more than one selector is given.
        """
        # NOTE: the previous check used ``len(filter(...))``, which
        # raises TypeError on Python 3, and compared the number of
        # *missing* selectors to 1, rejecting the single-selector calls
        # that every public method relies on.
        given = [s for s in (fileids, categories, textids) if s is not None]
        if len(given) > 1:
            raise ValueError(
                'Specify at most one of: fileids, categories or textids'
            )

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, string_types):
                textids = [textids]
            wanted = set(textids)
            files = []
            for text_id in textids:
                # .get() avoids inserting unknown ids into the defaultdict.
                for file_id in self._t2f.get(text_id, ()):
                    if file_id not in files:
                        # Each file is read once even when it holds
                        # several of the requested text ids.
                        files.append(file_id)
            tdict = {f: set(self._f2t[f]) & wanted for f in files}
            return files, tdict

        return None, None

    def decode_tag(self, tag):
        """Return ``tag`` unchanged (decoding is not implemented yet)."""
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.

        :return: the sorted text ids of the selected files, or of the
            whole mapping when neither argument is given.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, string_types):
            fileids = [fileids]
        collected = []
        for fileid in fileids:
            collected.extend(self._f2t.get(fileid, ()))
        return sorted(collected)

    def _views(
        self, fileids, categories, textids, tagged, group_by_sent, group_by_para
    ):
        """
        Shared implementation of the word/sentence/paragraph accessors:
        resolve the selection and concatenate one ``TEICorpusView`` per
        selected file.
        """
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        return concat(
            [
                TEICorpusView(
                    self.abspath(fileid),
                    tagged,
                    group_by_sent,
                    group_by_para,
                    head_len=self.head_len,
                    # No filter (None) unless the selection was by text id.
                    textids=textids[fileid] if textids else None,
                )
                for fileid in fileids
            ]
        )

    def words(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a flat sequence of words."""
        return self._views(fileids, categories, textids, False, False, False)

    def sents(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as sentences (lists of words)."""
        return self._views(fileids, categories, textids, False, True, False)

    def paras(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as paragraphs (lists of sentences)."""
        return self._views(fileids, categories, textids, False, True, True)

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as ``(word, tag)`` tuples."""
        return self._views(fileids, categories, textids, True, False, False)

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as sentences of ``(word, tag)`` tuples."""
        return self._views(fileids, categories, textids, True, True, False)

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as paragraphs of tagged sentences."""
        return self._views(fileids, categories, textids, True, True, True)

    def xml(self, fileids=None, categories=None):
        """
        :return: the XML of the single selected file, as returned by
            ``XMLCorpusReader.xml``.
        :raise TypeError: if the selection is not exactly one file.
        """
        fileids, _ = self._resolve(fileids, categories)
        if isinstance(fileids, string_types):
            # A bare string names one file; do not measure its length.
            fileids = [fileids]
        if fileids is None or len(fileids) != 1:
            raise TypeError('Expected a single file')
        return XMLCorpusReader.xml(self, fileids[0])

    def raw(self, fileids=None, categories=None):
        """:return: the concatenated contents of the selected files."""
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            try:
                contents.append(stream.read())
            finally:
                # Close every stream, even if reading fails.
                stream.close()
        return concat(contents)
|
||||
263
venv/lib/python3.7/site-packages/nltk/corpus/reader/plaintext.py
Normal file
263
venv/lib/python3.7/site-packages/nltk/corpus/reader/plaintext.py
Normal file
@@ -0,0 +1,263 @@
|
||||
# Natural Language Toolkit: Plaintext Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# Nitin Madnani <nmadnani@umiacs.umd.edu>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that consist of plaintext documents.
|
||||
"""
|
||||
|
||||
import nltk.data
|
||||
from nltk.tokenize import *
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
       ``PlaintextCorpusReader`` may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents.)"""

    # NOTE: the default tokenizer objects below are created once, when
    # the class is defined, and are shared by every reader that does
    # not pass its own.
    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'),
        para_block_reader=read_blankline_block,
        encoding='utf8',
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        raw_texts = []
        for f in fileids:
            _fin = self.open(f)
            try:
                raw_texts.append(_fin.read())
            finally:
                # Close the stream even when reading fails.
                _fin.close()
        return concat(raw_texts)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        :raise ValueError: if this reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')

        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        :raise ValueError: if this reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')

        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        """Tokenize the next 20 lines of ``stream`` into a flat word list."""
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        """
        Read the next paragraph block(s) from ``stream`` and return
        their sentences, each as a list of words.
        """
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        """
        Read the next paragraph block(s) from ``stream`` and return
        them as lists of sentences, each a list of words.
        """
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
|
||||
|
||||
|
||||
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    Plaintext corpus reader whose documents are additionally grouped
    into categories, so every accessor can select documents either by
    fileid or by category.
    """

    def __init__(self, *args, **kwargs):
        """
        Set up both parent readers.  The ``kwargs`` dict is handed to
        ``CategorizedCorpusReader`` first; afterwards it is expanded,
        together with ``args``, into the ``PlaintextCorpusReader``
        constructor.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Reduce a ``(fileids, categories)`` selection to fileids.

        :raise ValueError: if both selectors are given.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    def raw(self, fileids=None, categories=None):
        """Same as ``PlaintextCorpusReader.raw`` for the selected documents."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.raw(self, selected)

    def words(self, fileids=None, categories=None):
        """Same as ``PlaintextCorpusReader.words`` for the selected documents."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.words(self, selected)

    def sents(self, fileids=None, categories=None):
        """Same as ``PlaintextCorpusReader.sents`` for the selected documents."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.sents(self, selected)

    def paras(self, fileids=None, categories=None):
        """Same as ``PlaintextCorpusReader.paras`` for the selected documents."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.paras(self, selected)
|
||||
|
||||
|
||||
# FIXME: Is there a better way? How to not hardcode this?
|
||||
# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
|
||||
# override the `sent_tokenizer`.
|
||||
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    Categorized plaintext reader that always uses the Portuguese Punkt
    model as its sentence tokenizer.
    """

    def __init__(self, *args, **kwargs):
        """
        Same arguments as ``CategorizedPlaintextCorpusReader``; any
        ``sent_tokenizer`` supplied by the caller is overwritten.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        portuguese_punkt = nltk.data.LazyLoader('tokenizers/punkt/portuguese.pickle')
        kwargs['sent_tokenizer'] = portuguese_punkt
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
|
||||
|
||||
|
||||
class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:

      - Since the corpus is pre-processed and pre-tokenized, the
        word tokenizer should just split the line at whitespaces.
      - For the same reason, the sentence tokenizer should just
        split the paragraph at line breaks.
      - There is a new 'chapters()' method that returns chapters
        instead of paragraphs.
      - The 'paras()' method inherited from PlaintextCorpusReader is
        made non-functional to remove any confusion between chapters
        and paragraphs for Europarl.
    """

    def _read_word_block(self, stream):
        """Split the next 20 lines of ``stream`` at whitespace."""
        # Read 20 lines at a time.
        return [token for _ in range(20) for token in stream.readline().split()]

    def _read_sent_block(self, stream):
        """
        Return one whitespace-split word list per line of the block(s)
        produced by the paragraph block reader.
        """
        return [
            line.split()
            for chapter in self._para_block_reader(stream)
            for line in chapter.splitlines()
        ]

    def _read_para_block(self, stream):
        """
        Return each block produced by the paragraph block reader as a
        list of lines, each line split into words at whitespace.
        """
        return [
            [line.split() for line in chapter.splitlines()]
            for chapter in self._para_block_reader(stream)
        ]

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        views = []
        for (fileid, enc) in self.abspaths(fileids, True):
            views.append(self.CorpusView(fileid, self._read_para_block, encoding=enc))
        return concat(views)

    def paras(self, fileids=None):
        """Always raises ``NotImplementedError``; use ``chapters()``."""
        message = 'The Europarl corpus reader does not support paragraphs. Please use chapters() instead.'
        raise NotImplementedError(message)
|
||||
107
venv/lib/python3.7/site-packages/nltk/corpus/reader/ppattach.py
Normal file
107
venv/lib/python3.7/site-packages/nltk/corpus/reader/ppattach.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# Natural Language Toolkit: PP Attachment Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read lines from the Prepositional Phrase Attachment Corpus.
|
||||
|
||||
The PP Attachment Corpus contains several files having the format:
|
||||
|
||||
sentence_id verb noun1 preposition noun2 attachment
|
||||
|
||||
For example:
|
||||
|
||||
42960 gives authority to administration V
|
||||
46742 gives inventors of microchip N
|
||||
|
||||
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
|
||||
|
||||
(VP gives (NP authority) (PP to administration))
|
||||
(VP gives (NP inventors (PP of microchip)))
|
||||
|
||||
The corpus contains the following files:
|
||||
|
||||
training: training set
|
||||
devset: development test set, used for algorithm development.
|
||||
test: test set, used to report results
|
||||
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
|
||||
|
||||
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
|
||||
Phrase Attachment. Proceedings of the ARPA Human Language Technology
|
||||
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
|
||||
|
||||
The PP Attachment Corpus is distributed with NLTK with the permission
|
||||
of the author.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk import compat
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class PPAttachment(object):
    """
    One record of the PP attachment corpus: the sentence id, the verb,
    the first noun, the preposition, the second noun and the
    attachment label.
    """

    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        """Store the six fields of a record unchanged."""
        self.sent, self.verb = sent, verb
        self.noun1, self.prep, self.noun2 = noun1, prep, noun2
        self.attachment = attachment

    def __repr__(self):
        """Return a constructor-like rendering of all six fields."""
        template = (
            'PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, '
            'noun2=%r, attachment=%r)'
        )
        values = (
            self.sent,
            self.verb,
            self.noun1,
            self.prep,
            self.noun2,
            self.attachment,
        )
        return template % values
|
||||
|
||||
|
||||
class PPAttachmentCorpusReader(CorpusReader):
    """
    Reader for files whose lines have the form:

    sentence_id verb noun1 preposition noun2 attachment
    """

    def attachments(self, fileids):
        """
        :return: the lines of the given file(s) as ``PPAttachment``
            objects.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def tuples(self, fileids):
        """
        :return: the lines of the given file(s) as tuples of their
            whitespace-separated fields.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the contents of the given file(s) (all files when
            ``fileids`` is None), concatenated.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                # Do not leave the stream open once it has been read.
                stream.close()
        return concat(contents)

    def _read_tuple_block(self, stream):
        """Read one line; return it as a one-tuple list, or [] at end of stream."""
        line = stream.readline()
        if line:
            return [tuple(line.split())]
        else:
            return []

    def _read_obj_block(self, stream):
        """
        Read one line; return it as a one-``PPAttachment`` list, or []
        at end of stream.
        """
        line = stream.readline()
        if line:
            return [PPAttachment(*line.split())]
        else:
            return []
|
||||
539
venv/lib/python3.7/site-packages/nltk/corpus/reader/propbank.py
Normal file
539
venv/lib/python3.7/site-packages/nltk/corpus/reader/propbank.py
Normal file
@@ -0,0 +1,539 @@
|
||||
# Natural Language Toolkit: PropBank Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
from functools import total_ordering
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.tree import Tree
|
||||
from nltk.internals import raise_unorderable_types
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles='',
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding='utf8',
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file read by ``verbs()``.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, string_types):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            # A single fileid was given; the previous one-argument
            # isinstance() call raised TypeError here.
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def instances(self, baseform=None):
        """
        :param baseform: if given, only instances whose ``baseform``
            equals this value are included.
        :return: a corpus view that acts as a list of
            ``PropbankInstance`` objects, one for each verb instance
            in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def _load_frameset(self, framefile):
        """Parse one frameset file and return its XML root element."""
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        stream = self.abspath(framefile).open()
        try:
            return ElementTree.parse(stream).getroot()
        finally:
            # The parsed tree no longer needs the stream.
            stream.close()

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file for the roleset's
            baseform is unknown, or does not contain the roleset.
        """
        baseform = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % baseform
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' % roleset_id)

        etree = self._load_frameset(framefile)
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))

    def rolesets(self, baseform=None):
        """
        :param baseform: if given, only the rolesets of this baseform's
            frameset file are returned.
        :return: list of xml descriptions for rolesets.
        :raise ValueError: if ``baseform`` has no frameset file.
        """
        if baseform is not None:
            framefile = 'frames/%s.xml' % baseform
            if framefile not in self._framefiles:
                raise ValueError('Frameset file for %s not found' % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = []
        for framefile in framefiles:
            etree = self._load_frameset(framefile)
            rsets.append(etree.findall('predicate/roleset'))
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and return the parsed
        instances that ``instance_filter`` accepts; empty lines are
        skipped.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Propbank Instance & related datatypes
|
||||
######################################################################
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
|
||||
class PropbankInstance(object):
|
||||
def __init__(
|
||||
self,
|
||||
fileid,
|
||||
sentnum,
|
||||
wordnum,
|
||||
tagger,
|
||||
roleset,
|
||||
inflection,
|
||||
predicate,
|
||||
arguments,
|
||||
parse_corpus=None,
|
||||
):
|
||||
|
||||
self.fileid = fileid
|
||||
"""The name of the file containing the parse tree for this
|
||||
instance's sentence."""
|
||||
|
||||
self.sentnum = sentnum
|
||||
"""The sentence number of this sentence within ``fileid``.
|
||||
Indexing starts from zero."""
|
||||
|
||||
self.wordnum = wordnum
|
||||
"""The word number of this instance's predicate within its
|
||||
containing sentence. Word numbers are indexed starting from
|
||||
zero, and include traces and other empty parse elements."""
|
||||
|
||||
self.tagger = tagger
|
||||
"""An identifier for the tagger who tagged this instance; or
|
||||
``'gold'`` if this is an adjuticated instance."""
|
||||
|
||||
self.roleset = roleset
|
||||
"""The name of the roleset used by this instance's predicate.
|
||||
Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
|
||||
look up information about the roleset."""
|
||||
|
||||
self.inflection = inflection
|
||||
"""A ``PropbankInflection`` object describing the inflection of
|
||||
this instance's predicate."""
|
||||
|
||||
self.predicate = predicate
|
||||
"""A ``PropbankTreePointer`` indicating the position of this
|
||||
instance's predicate within its containing sentence."""
|
||||
|
||||
self.arguments = tuple(arguments)
|
||||
"""A list of tuples (argloc, argid), specifying the location
|
||||
and identifier for each of the predicate's argument in the
|
||||
containing sentence. Argument identifiers are strings such as
|
||||
``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain
|
||||
the predicate."""
|
||||
|
||||
self.parse_corpus = parse_corpus
|
||||
"""A corpus reader for the parse trees corresponding to the
|
||||
instances in this propbank corpus."""
|
||||
|
||||
@property
|
||||
def baseform(self):
|
||||
"""The baseform of the predicate."""
|
||||
return self.roleset.split('.')[0]
|
||||
|
||||
@property
def sensenumber(self):
    """The sense number of the predicate, as a string.

    This is the second ``'.'``-separated field of ``self.roleset``.
    ``IndexError`` is raised if the roleset contains no ``'.'``.
    """
    fields = self.roleset.split('.')
    return fields[1]
|
||||
|
||||
@property
def predid(self):
    """Identifier of the predicate; this is always the string ``'rel'``."""
    identifier = 'rel'
    return identifier
|
||||
|
||||
def __repr__(self):
    """Return a short tag naming this instance's file, sentence and word."""
    location = (self.fileid, self.sentnum, self.wordnum)
    return '<PropbankInstance: %s, sent %s, word %s>' % location
|
||||
|
||||
def __str__(self):
    """Return this instance formatted as a single propbank line.

    The six leading fields (fileid, sentnum, wordnum, tagger, roleset,
    inflection) are followed by one ``location-identifier`` field per
    argument.  The predicate is included among those fields with the
    identifier ``'rel'``, and the fields are emitted in sorted order.
    """
    header = '%s %s %s %s %s %s' % (
        self.fileid,
        self.sentnum,
        self.wordnum,
        self.tagger,
        self.roleset,
        self.inflection,
    )
    # The predicate is sorted in with the arguments so that it appears at
    # its natural position in the line.
    pointers = self.arguments + ((self.predicate, 'rel'),)
    fields = ''.join(
        ' %s-%s' % (argloc, argid) for (argloc, argid) in sorted(pointers)
    )
    return header + fields
|
||||
|
||||
def _get_tree(self):
    """Return the parse tree for this instance's sentence, or None.

    None is returned when no parse corpus is attached, or when the parse
    corpus does not list ``self.fileid`` among its fileids.
    """
    corpus = self.parse_corpus
    if corpus is None or self.fileid not in corpus.fileids():
        return None
    return corpus.parsed_sents(self.fileid)[self.sentnum]


tree = property(
    _get_tree,
    doc="""
    The parse tree corresponding to this instance, or None if
    the corresponding tree is not available.""",
)
|
||||
|
||||
@staticmethod
def parse(s, parse_fileid_xform=None, parse_corpus=None):
    """Build a ``PropbankInstance`` from one line of a propbank file.

    :param s: a whitespace-separated propbank line.  The first six fields
        are fileid, sentence number, word number, tagger, roleset and
        inflection; the remaining fields are ``location-identifier``
        pairs, exactly one of which must end in ``-rel``.
    :param parse_fileid_xform: optional callable applied to the fileid
        before it is stored.
    :param parse_corpus: optional corpus reader passed through to the new
        instance.
    :raise ValueError: if the line has fewer than seven fields, or does
        not contain exactly one ``-rel`` field.
    """
    pieces = s.split()
    if len(pieces) < 7:
        raise ValueError('Badly formatted propbank line: %r' % s)

    # Divide the line into its basic pieces.
    fileid, sentnum, wordnum, tagger, roleset, inflection = pieces[:6]
    rel, args = [], []
    for piece in pieces[6:]:
        bucket = rel if piece.endswith('-rel') else args
        bucket.append(piece)
    if len(rel) != 1:
        raise ValueError('Badly formatted propbank line: %r' % s)

    # Apply the fileid selector, if any.
    if parse_fileid_xform is not None:
        fileid = parse_fileid_xform(fileid)

    # Convert sentence & word numbers to ints.
    sentnum, wordnum = int(sentnum), int(wordnum)

    # Parse the inflection.
    inflection = PropbankInflection.parse(inflection)

    # Parse the predicate location (the field text minus its '-rel' suffix).
    predicate = PropbankTreePointer.parse(rel[0][:-4])

    # Parse the arguments; each is split at its first '-' into a location
    # and an identifier.
    split_args = (arg.split('-', 1) for arg in args)
    arguments = [
        (PropbankTreePointer.parse(argloc), argid) for argloc, argid in split_args
    ]

    # Put it all together.
    return PropbankInstance(
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus,
    )
|
||||
|
||||
|
||||
class PropbankPointer(object):
    """
    Abstract base class for the pointers propbank uses to identify one
    or more constituents in a parse tree.  It has three concrete
    subclasses:

      - ``PropbankTreePointer`` points to a single constituent.
      - ``PropbankSplitTreePointer`` points to a 'split' constituent,
        made up of a sequence of two or more ``PropbankTreePointer``
        pointers.
      - ``PropbankChainTreePointer`` points to an entire trace chain in
        a tree.  It is made up of a sequence of pieces, each of which is
        a ``PropbankTreePointer`` or a ``PropbankSplitTreePointer``.
    """

    def __init__(self):
        # Only subclasses may be instantiated.
        is_base_class = self.__class__ == PropbankPointer
        if is_base_class:
            raise NotImplementedError()
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class PropbankChainTreePointer(PropbankPointer):
    """
    A pointer to an entire trace chain in a tree, written in propbank
    files as its pieces joined by ``'*'``.
    """

    def __init__(self, pieces):
        """
        :param pieces: the list of pointers that make up this chain.
        """
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements may
           be either ``PropbankSplitTreePointer`` or
           ``PropbankTreePointer`` pointers."""

    def __str__(self):
        return '*'.join('%s' % p for p in self.pieces)

    def __repr__(self):
        return '<PropbankChainTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``Tree`` labelled ``'*CHAIN*'`` whose children are the
        constituents that each piece of this chain selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not available')
        return Tree('*CHAIN*', [p.select(tree) for p in self.pieces])
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class PropbankSplitTreePointer(PropbankPointer):
    """
    A pointer to a 'split' constituent, written in propbank files as its
    pieces joined by ``','``.
    """

    def __init__(self, pieces):
        """
        :param pieces: the list of pointers that make up this split
            constituent.
        """
        self.pieces = pieces
        """A list of the pieces that make up this split constituent.
           Elements are all ``PropbankTreePointer`` pointers."""

    def __str__(self):
        return ','.join('%s' % p for p in self.pieces)

    def __repr__(self):
        return '<PropbankSplitTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``Tree`` labelled ``'*SPLIT*'`` whose children are the
        constituents that each piece of this pointer selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not available')
        return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])
|
||||
|
||||
|
||||
@total_ordering
@compat.python_2_unicode_compatible
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written as ``wordnum:height``.

    ``PropbankTreePointer.parse()`` also accepts the two composite
    notations and returns the matching pointer class for them:

      - ``wordnum:height*wordnum:height*...`` (a chain)
      - ``wordnum:height,wordnum:height,...`` (a split constituent)
    """

    def __init__(self, wordnum, height):
        """
        :param wordnum: the index of the word this pointer is anchored on;
            ``treepos()`` counts the tree's leaves from zero.
        :param height: the number of levels to go up from that word's
            parent node to reach the constituent pointed to.
        """
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string into a pointer object.

        :param s: a pointer in ``wordnum:height`` form, or several such
            pointers joined by ``'*'`` (chain) or ``','`` (split).
        :return: a ``PropbankChainTreePointer`` if ``s`` contains ``'*'``,
            otherwise a ``PropbankSplitTreePointer`` if it contains
            ``','``, otherwise a ``PropbankTreePointer``.
        :raise ValueError: if a simple pointer does not consist of exactly
            two ``':'``-separated integers.
        """
        # Deal with chains (xx*yy*zz).  Chains are checked first, so the
        # pieces of a chain may themselves be split pointers.
        pieces = s.split('*')
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(',')
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(':')
        if len(pieces) != 2:
            raise ValueError('bad propbank pointer %r' % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)

    def __eq__(self, other):
        # A chain or split pointer is compared through its first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # A chain or split pointer is compared through its first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            # Arbitrary ordering against unrelated objects.
            return id(self) < id(other)

        # Order by word number; for the same word, the greater height
        # sorts first.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """
        Return the constituent of ``tree`` that this pointer points to.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not available')
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :return: the tree position, as a tuple of child indices.
        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not available')
        # Depth-first walk: ``stack`` holds the nodes on the current path
        # and ``treepos`` the child index taken at each level.
        stack = [tree]
        treepos = []

        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Drop the leaf's own index plus ``height`` more levels.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class PropbankInflection(object):
    """
    The inflection of a propbank predicate: five single-character codes
    giving, in order, its form, tense, aspect, person and voice.  The
    code ``'-'`` (``NONE``) is accepted in every position.
    """

    # { Inflection Form
    INFINITIVE = 'i'
    GERUND = 'g'
    PARTICIPLE = 'p'
    FINITE = 'v'
    # { Inflection Tense
    FUTURE = 'f'
    PAST = 'p'
    PRESENT = 'n'
    # { Inflection Aspect
    PERFECT = 'p'
    PROGRESSIVE = 'o'
    PERFECT_AND_PROGRESSIVE = 'b'
    # { Inflection Person
    THIRD_PERSON = '3'
    # { Inflection Voice
    ACTIVE = 'a'
    PASSIVE = 'p'
    # { Inflection
    NONE = '-'
    # }

    # One character class per field, in the order form/tense/aspect/person/voice.
    _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')

    def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
        self.form, self.tense, self.aspect = form, tense, aspect
        self.person, self.voice = person, voice

    def __str__(self):
        """Return the five codes concatenated in field order."""
        text = self.form + self.tense
        text = text + self.aspect
        text = text + self.person
        return text + self.voice

    def __repr__(self):
        return '<PropbankInflection: %s>' % self

    @staticmethod
    def parse(s):
        """
        Build a ``PropbankInflection`` from a five-character code string.

        :raise TypeError: if ``s`` is not a string.
        :raise ValueError: if ``s`` is not exactly five valid codes.
        """
        if not isinstance(s, string_types):
            raise TypeError('expected a string')
        well_formed = len(s) == 5 and PropbankInflection._VALIDATE.match(s)
        if not well_formed:
            raise ValueError('Bad propbank inflection string %r' % s)
        form, tense, aspect, person, voice = s
        return PropbankInflection(form, tense, aspect, person, voice)
|
||||
143
venv/lib/python3.7/site-packages/nltk/corpus/reader/pros_cons.py
Normal file
143
venv/lib/python3.7/site-packages/nltk/corpus/reader/pros_cons.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# Natural Language Toolkit: Pros and Cons Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for the Pros and Cons dataset.
|
||||
|
||||
- Pros and Cons dataset information -
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
http://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
||||
Proceedings of the 22nd International Conference on Computational Linguistics
|
||||
(Coling-2008), Manchester, 18-22 August, 2008.
|
||||
|
||||
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
|
||||
Opinions on the Web". Proceedings of the 14th international World Wide Web
|
||||
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
|
||||
"""
|
||||
import re
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
|
||||
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons')
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding='utf8',
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        :raise ValueError: if both ``fileids`` and ``categories`` are given.
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        :raise ValueError: if both ``fileids`` and ``categories`` are given.
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """
        Read up to 20 lines from ``stream`` and return the tokenized text of
        every line of the form ``<Pros>...</Pros>`` or ``<Cons>...</Cons>``.
        """
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # readline() returns an empty string only at end of file
                # (a blank line is '\n'), so there is nothing left to read.
                break
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        """
        Read one block of sentences from ``stream`` and return their tokens
        as a single flat list.
        """
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

    def _resolve(self, fileids, categories):
        """
        Return the fileids selected by ``categories`` if categories are
        given, otherwise return ``fileids`` unchanged.

        :raise ValueError: if both ``fileids`` and ``categories`` are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids
|
||||
355
venv/lib/python3.7/site-packages/nltk/corpus/reader/reviews.py
Normal file
355
venv/lib/python3.7/site-packages/nltk/corpus/reader/reviews.py
Normal file
@@ -0,0 +1,355 @@
|
||||
# Natural Language Toolkit: Product Reviews Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
|
||||
|
||||
- Customer Review Corpus information -
|
||||
Annotated by: Minqing Hu and Bing Liu, 2004.
|
||||
Department of Computer Science
|
||||
University of Illinois at Chicago
|
||||
|
||||
Contact: Bing Liu, liub@cs.uic.edu
|
||||
http://www.cs.uic.edu/~liub
|
||||
|
||||
Distributed with permission.
|
||||
|
||||
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
|
||||
annotated customer reviews of 5 and 9 products from amazon.com.
|
||||
|
||||
Related papers:
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
|
||||
Proceedings of the ACM SIGKDD International Conference on Knowledge
|
||||
Discovery & Data Mining (KDD-04), 2004.
|
||||
|
||||
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
|
||||
Proceedings of Nineteenth National Conference on Artificial Intelligence
|
||||
(AAAI-2004), 2004.
|
||||
|
||||
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
|
||||
Opinion Mining." Proceedings of First ACM International Conference on Web
|
||||
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
|
||||
Stanford, California, USA.
|
||||
|
||||
Symbols used in the annotated reviews:
|
||||
|
||||
[t] : the title of the review: Each [t] tag starts a review.
|
||||
xxxx[+|-n]: xxxx is a product feature.
|
||||
[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
|
||||
Note that the strength is quite subjective.
|
||||
You may want to ignore it and consider only + and -
|
||||
[-n]: Negative opinion
|
||||
## : start of each sentence. Each line is a sentence.
|
||||
[u] : feature does not appear in the sentence.
|
||||
[p] : feature does not appear in the sentence. Pronoun resolution is needed.
|
||||
[s] : suggestion or recommendation.
|
||||
[cc]: comparison with a competing product from a different brand.
|
||||
[cs]: comparison with a competing product from the same brand.
|
||||
|
||||
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
|
||||
provide separation between different reviews. This is due to the fact that
|
||||
the dataset was specifically designed for aspect/feature-based sentiment
|
||||
analysis, for which sentence-level annotation is sufficient. For document-
|
||||
level classification and analysis, this peculiarity should be taken into
|
||||
consideration.
|
||||
"""
|
||||
|
||||
from __future__ import division
|
||||
|
||||
import re
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.tokenize import *
|
||||
|
||||
# Patterns for the annotation syntax of the review files.

# '[t] title': captures everything after the '[t]' tag.
TITLE = re.compile(r'^\[t\](.*)$')

# 'feature[+3]': captures the feature (one or more words) and its signed
# single-digit opinion strength.
FEATURES = re.compile(r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]')

# 'camera[+2][p]': captures the note code 'p' (one of p, u, s, cc, cs).
NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]')

# '##sentence': captures the sentence text that follows the '##' marker.
SENT = re.compile(r'##(.*)$')
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class Review(object):
    """
    A Review is the main block of a ReviewsCorpusReader: a title together
    with the ReviewLines that belong to it.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the
            Review. A new empty list is used when this is None.
        """
        self.title = title
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Append a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        return [
            feature
            for review_line in self.review_lines
            for feature in review_line.features
        ]

    def sents(self):
        """
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        sentences = []
        for review_line in self.review_lines:
            sentences.append(review_line.sent)
        return sentences

    def __repr__(self):
        template = 'Review(title=\"{}\", review_lines={})'
        return template.format(self.title, self.review_lines)
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class ReviewLine(object):
    """
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    """

    def __init__(self, sent, features=None, notes=None):
        """
        :param sent: the sentence of this line.
        :param features: the feature annotations of the line; a new empty
            list is used when this is None.
        :param notes: the note annotations of the line; a new empty list is
            used when this is None.
        """
        self.sent = sent
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        template = 'ReviewLine(features={}, notes={}, sent={})'
        return template.format(self.features, self.notes, self.sent)
|
||||
|
||||
|
||||
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

        >>> from nltk.corpus import product_reviews_1
        >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
        >>> review = camera_reviews[0]
        >>> review.sents()[0]
        ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
        'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
        >>> review.features()
        [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
        ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
        ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
        ('option', '+1')]

    We can also reach the same information directly from the stream:

        >>> product_reviews_1.features('Canon_G3.txt')
        [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

        >>> from __future__ import division
        >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> # True division is guaranteed in Python 2.7 by the __future__ import above
        >>> mean = tot / n_reviews
        >>> print(n_reviews, tot, mean)
        15 24 1.6
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding='utf8'
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :param fileids: a list or regexp specifying the fileids of the files that
            have to be returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read, instead of
            # leaving it open until garbage collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def readme(self):
        """
        Return the contents of the corpus README.txt file.
        """
        return self.open("README.txt").read()

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        """
        Read up to 20 lines from ``stream`` and return every
        ``(feature, score)`` annotation found in them.
        """
        features = []
        for i in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(re.findall(FEATURES, line))
        return features

    def _read_review_block(self, stream):
        """
        Read one review from ``stream``: skip ahead to the next title line,
        then collect lines until the following title line or end of file.

        :return: a one-element list holding the Review, or an empty list if
            end of file is reached before a title line is found.
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = re.match(TITLE, line)
            if title_match:
                review = Review(
                    title=title_match.group(1).strip()
                )  # We create a new review
                break

        # Scan until we find another line matching the regexp, or EOF.
        while True:
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: backup to just before it starts, and
            # return the review we've already collected.
            if re.match(TITLE, line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review line.
            feats = re.findall(FEATURES, line)
            notes = re.findall(NOTES, line)
            sent = re.findall(SENT, line)
            if sent:
                sent = self._word_tokenizer.tokenize(sent[0])
            review_line = ReviewLine(sent=sent, features=feats, notes=notes)
            review.add_line(review_line)

    def _read_sent_block(self, stream):
        """
        Read one review from ``stream`` and return its sentences.
        """
        sents = []
        for review in self._read_review_block(stream):
            sents.extend(review.sents())
        return sents

    def _read_word_block(self, stream):
        """
        Read up to 20 lines from ``stream`` and return the tokens of every
        sentence found in them as a single flat list.
        """
        words = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: nothing more to read in this block.
                break
            sent = re.findall(SENT, line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words
|
||||
151
venv/lib/python3.7/site-packages/nltk/corpus/reader/rte.py
Normal file
151
venv/lib/python3.7/site-packages/nltk/corpus/reader/rte.py
Normal file
@@ -0,0 +1,151 @@
|
||||
# Natural Language Toolkit: RTE Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
|
||||
|
||||
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
|
||||
were regularized.
|
||||
|
||||
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
|
||||
gold standard annotated files.
|
||||
|
||||
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
|
||||
example is taken from RTE3::
|
||||
|
||||
<pair id="1" entailment="YES" task="IE" length="short" >
|
||||
|
||||
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
|
||||
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
|
||||
company Baikalfinansgroup which was later bought by the Russian
|
||||
state-owned oil company Rosneft .</t>
|
||||
|
||||
<h>Baikalfinansgroup was sold to Rosneft.</h>
|
||||
</pair>
|
||||
|
||||
In order to provide globally unique IDs for each pair, a new attribute
|
||||
``challenge`` has been added to the root element ``entailment-corpus`` of each
|
||||
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
|
||||
challenge number and 'n' is the pair ID.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk import compat
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import *
|
||||
|
||||
|
||||
def norm(value_string):
    """
    Normalize the string value in an RTE pair's ``value`` or ``entailment``
    attribute as an integer (1, 0).

    The comparison is case-insensitive: ``TRUE`` and ``YES`` map to 1,
    ``FALSE`` and ``NO`` map to 0.

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    :raise KeyError: if the label is not one of the four known values
    """
    label = value_string.upper()
    if label in ("TRUE", "YES"):
        return 1
    if label in ("FALSE", "NO"):
        return 0
    raise KeyError(label)
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class RTEPair(object):
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. Whichever is present is normalized with
    ``norm()`` and stored in the ``value`` attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param pair: the ``<pair>`` XML element; its ``id`` attribute and its
            first two children (text and hypothesis) are always read from it
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair (unused; the id is read from ``pair``)
        :param text: the text component of the pair (unused; read from ``pair``)
        :param hyp: the hypothesis component of the pair (unused; read from ``pair``)
        :param value: classification label for the pair, used when ``pair``
            has neither a ``value`` nor an ``entailment`` attribute
        :param task: attribute for the particular NLP task that the data was
            drawn from, used when ``pair`` has no ``task`` attribute
        :param length: attribute for the length of the text of the pair, used
            when ``pair`` has no ``length`` attribute
        """
        self.challenge = challenge
        attrib = pair.attrib
        self.id = attrib["id"]
        self.gid = "%s-%s" % (self.challenge, self.id)
        self.text = pair[0].text
        self.hyp = pair[1].text

        # 'value' takes precedence over 'entailment'; fall back to the
        # constructor argument when the element carries neither.
        for label_key in ("value", "entailment"):
            if label_key in attrib:
                self.value = norm(attrib[label_key])
                break
        else:
            self.value = value

        self.task = attrib.get("task", task)
        self.length = attrib.get("length", length)

    def __repr__(self):
        if not self.challenge:
            return '<RTEPair: id=%s>' % self.id
        return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
|
||||
|
||||
|
||||
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPairs.

        This uses the ``iter()`` method of the ElementTree ``Element`` to
        find all the ``<pair>`` elements. (The older ``getiterator()``
        spelling is deprecated and no longer exists in Python 3.9+.)

        :param doc: a parsed XML document (the root element)
        :rtype: list(RTEPair)
        """
        # The challenge name is optional on the root element.
        challenge = doc.attrib.get('challenge')
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids, or a single fileid
        :type fileids: list or str
        :rtype: list(RTEPair)
        """
        if isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])
|
||||
297
venv/lib/python3.7/site-packages/nltk/corpus/reader/semcor.py
Normal file
297
venv/lib/python3.7/site-packages/nltk/corpus/reader/semcor.py
Normal file
@@ -0,0 +1,297 @@
|
||||
# Natural Language Toolkit: SemCor Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Corpus reader for the SemCor Corpus.
|
||||
"""
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
__docformat__ = 'epytext en'
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
|
||||
from nltk.tree import Tree
|
||||
|
||||
|
||||
class SemcorCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the SemCor Corpus.
    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, wordnet, lazy=True):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param wordnet: The WordNet reader used to resolve sense keys
            (its ``lemma_from_key()`` method is called).
        :param lazy: If true, read files through ``SemcorWordView``;
            otherwise parse each file eagerly with ``_words()``.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy
        self._wordnet = wordnet

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return self._items(fileids, 'word', False, False, False)

    def chunks(self, fileids=None):
        """
        :return: the given file(s) as a list of chunks,
            each of which is a list of words and punctuation symbols
            that form a unit.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'chunk', False, False, False)

    def tagged_chunks(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of tagged chunks, represented
            in tree form.
        :rtype: list(Tree)

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  The default is `'pos'`.
            Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node
            if the chunk is a named entity without a specific entry in
            WordNet.  (Named entities of type 'other' have no lemma.  Other
            chunks not in WordNet have no semantic tag.  Punctuation tokens
            have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', False, tag != 'sem', tag != 'pos')

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of word strings.
        :rtype: list(list(str))
        """
        return self._items(fileids, 'word', True, False, False)

    def chunk_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of sentences, each encoded
            as a list of chunks.
        :rtype: list(list(list(str)))
        """
        return self._items(fileids, 'chunk', True, False, False)

    def tagged_sents(self, fileids=None, tag='pos'):
        """
        :return: the given file(s) as a list of sentences. Each sentence
            is represented as a list of tagged chunks (in tree form).
        :rtype: list(list(Tree))

        :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
            to indicate the kind of tags to include.  The default is `'pos'`.
            Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node
            if the chunk is a named entity without a specific entry in
            WordNet.  (Named entities of type 'other' have no lemma.  Other
            chunks not in WordNet have no semantic tag.  Punctuation tokens
            have `None` for their part of speech tag.)
        """
        return self._items(fileids, 'chunk', True, tag != 'sem', tag != 'pos')

    def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
        """
        Read every requested file and concatenate the per-file results.

        :param fileids: The fileids to read (passed to ``abspaths()``).
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        """
        # A SemcorWordView over single words yields one list per XML element
        # (an element may hold a multiword unit), so it has to be flattened.
        flatten_view = unit == 'word' and not bracket_sent

        def read(fileid):
            if not self._lazy:
                # _words() is a bound method that takes the WordNet reader
                # from self, and it already returns a flat list of words, so
                # it must get neither the extra argument nor the flattening.
                return self._words(fileid, unit, bracket_sent, pos_tag, sem_tag)
            view = SemcorWordView(
                fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet
            )
            return LazyConcatenation(view) if flatten_view else view

        return concat([read(fileid) for fileid in self.abspaths(fileids)])

    def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
        """
        Helper used to implement the view methods -- returns a list of
        tokens, (segmented) words, chunks, or sentences. The tokens
        and chunks may optionally be tagged (with POS and sense
        information).

        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        """
        assert unit in ('token', 'word', 'chunk')
        result = []

        xmldoc = ElementTree.parse(fileid).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                itm = SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
                # For unit 'word', _word() returns a list of word strings.
                if unit == 'word':
                    sent.extend(itm)
                else:
                    sent.append(itm)

            if bracket_sent:
                result.append(SemcorSentence(xmlsent.attrib['snum'], sent))
            else:
                result.extend(sent)

        assert None not in result
        return result

    @staticmethod
    def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
        """
        Convert one ``<wf>``/``<punc>`` element into the requested unit.

        :param xmlword: The XML element to convert.
        :param unit: `'token'` returns the raw token (or a tuple of the token
            plus the requested tags); `'word'` returns the token split on
            underscores; anything else returns a chunk (a list of words or a
            ``Tree``, depending on the tags requested).
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags.
        :param wordnet: Reader whose ``lemma_from_key()`` resolves sense keys.
        """
        tkn = xmlword.text
        if not tkn:
            tkn = ""  # fixes issue 337?

        lemma = xmlword.get('lemma', tkn)  # lemma or NE class
        lexsn = xmlword.get('lexsn')  # lex_sense (locator for the lemma's sense)
        if lexsn is not None:
            sense_key = lemma + '%' + lexsn
            wnpos = ('n', 'v', 'a', 'r', 's')[
                int(lexsn.split(':')[0]) - 1
            ]  # see http://wordnet.princeton.edu/man/senseidx.5WN.html
        else:
            sense_key = wnpos = None
        # The "rdf" attribute (a redefinition of the lookup string, e.g. after
        # typographical adjustments or for a discontinuous multiword
        # expression) is not needed to build any of the units below.
        sensenum = xmlword.get('wnsn')  # WordNet sense number
        isOOVEntity = 'pn' in xmlword.keys()  # a "personal name" (NE) not in WordNet
        pos = xmlword.get(
            'pos'
        )  # part of speech for the whole chunk (None for punctuation)

        if unit == 'token':
            if not pos_tag and not sem_tag:
                itm = tkn
            else:
                itm = (
                    (tkn,)
                    + ((pos,) if pos_tag else ())
                    + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
                )
            return itm

        ww = tkn.split('_')  # TODO: case where punctuation intervenes in MWE
        if unit == 'word':
            return ww

        if sensenum is not None:
            try:
                sense = wordnet.lemma_from_key(sense_key)  # Lemma object
            except Exception:
                # cannot retrieve the wordnet.Lemma object. possible reasons:
                #  (a) the wordnet corpus is not downloaded;
                #  (b) a nonexistent sense is annotated: e.g., such.s.00 triggers:
                #  nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
                # solution: just use the lemma name as a string
                try:
                    sense = '%s.%s.%02d' % (
                        lemma,
                        wnpos,
                        int(sensenum),
                    )  # e.g.: reach.v.02
                except ValueError:
                    sense = (
                        lemma + '.' + wnpos + '.' + sensenum
                    )  # e.g. the sense number may be "2;1"

        bottom = [Tree(pos, ww)] if pos_tag else ww

        if sem_tag and isOOVEntity:
            if sensenum is not None:
                return Tree(sense, [Tree('NE', bottom)])
            else:  # 'other' NE
                return Tree('NE', bottom)
        elif sem_tag and sensenum is not None:
            return Tree(sense, bottom)
        elif pos_tag:
            return bottom[0]
        else:
            return bottom  # chunk as a list
|
||||
|
||||
|
||||
def _all_xmlwords_in(elt, result=None):
|
||||
if result is None:
|
||||
result = []
|
||||
for child in elt:
|
||||
if child.tag in ('wf', 'punc'):
|
||||
result.append(child)
|
||||
else:
|
||||
_all_xmlwords_in(child, result)
|
||||
return result
|
||||
|
||||
|
||||
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (callers in this module pass the ``snum``
    attribute of the XML sentence element).
    """

    def __init__(self, num, items):
        """
        :param num: the sentence identifier to store in ``self.num``
        :param items: the iterable of items that make up the sentence
        """
        self.num = num
        super(SemcorSentence, self).__init__(items)
|
||||
|
||||
|
||||
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: The WordNet reader passed on to
            ``SemcorCorpusReader._word()``.
        """
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet

        # Select whole sentences, or the individual words inside them.
        tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'
        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        """Dispatch a matched element to the sentence or the word handler."""
        handler = self.handle_sent if self._sent else self.handle_word
        return handler(elt)

    def handle_word(self, elt):
        """Convert a single ``<wf>``/``<punc>`` element."""
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        """
        Convert a sentence element into a ``SemcorSentence``.

        :raise ValueError: if a direct child is neither ``<wf>`` nor ``<punc>``.
        """
        items = []
        for child in elt:
            if child.tag not in ('wf', 'punc'):
                raise ValueError('Unexpected element %s' % child.tag)
            converted = self.handle_word(child)
            # For unit 'word' each conversion is a list of word strings.
            if self._unit == 'word':
                items.extend(converted)
            else:
                items.append(converted)
        return SemcorSentence(elt.attrib['snum'], items)
|
||||
212
venv/lib/python3.7/site-packages/nltk/corpus/reader/senseval.py
Normal file
212
venv/lib/python3.7/site-packages/nltk/corpus/reader/senseval.py
Normal file
@@ -0,0 +1,212 @@
|
||||
# Natural Language Toolkit: Senseval 2 Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
||||
# Steven Bird <stevenbird1@gmail.com> (modifications)
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read from the Senseval 2 Corpus.
|
||||
|
||||
SENSEVAL [http://www.senseval.org/]
|
||||
Evaluation exercises for Word Sense Disambiguation.
|
||||
Organized by ACL-SIGLEX [http://www.siglex.org/]
|
||||
|
||||
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
|
||||
http://www.d.umn.edu/~tpederse/data.html
|
||||
Distributed with permission.
|
||||
|
||||
The NLTK version of the Senseval 2 files uses well-formed XML.
|
||||
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
|
||||
is tagged with a sense identifier, and supplied with context.
|
||||
"""
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import re
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk import compat
|
||||
from nltk.tokenize import *
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class SensevalInstance(object):
    """
    One tagged occurrence of an ambiguous word.

    ``SensevalCorpusView`` builds these with the lexical element name as
    ``word``, the index of the head word within ``context`` as ``position``,
    the list of context tokens as ``context``, and the sense identifiers as
    ``senses`` (stored as a tuple).
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.senses = tuple(senses)
        self.position = position
        self.context = context

    def __repr__(self):
        fields = (self.word, self.position, self.context, self.senses)
        return 'SensevalInstance(word=%r, position=%r, context=%r, senses=%r)' % fields
|
||||
|
||||
|
||||
class SensevalCorpusReader(CorpusReader):
    """Corpus reader that yields ``SensevalInstance`` objects."""

    def instances(self, fileids=None):
        """
        :return: the instances of the given file(s), read through one
            ``SensevalCorpusView`` per file.
        """
        return concat(
            [
                SensevalCorpusView(fileid, enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read, so file handles
            # are not left to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _entry(self, tree):
        """
        Extract ``(sense, context)`` pairs from a parsed document.

        For every ``<instance>`` of every ``<lexelt>`` directly under
        ``tree``, the sense is the ``senseid`` attribute of the instance's
        first child and the context is the list of ``(text, pos)`` pairs of
        the children of its second child.
        """
        elts = []
        for lexelt in tree.findall('lexelt'):
            for inst in lexelt.findall('instance'):
                sense = inst[0].attrib['senseid']
                context = [(w.text, w.attrib['pos']) for w in inst[1]]
                elts.append((sense, context))
        return elts
|
||||
|
||||
|
||||
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Corpus view that reads one ``<instance>`` per block and converts it
    into a ``SensevalInstance``.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """
        Read lines up to and including the next ``</instance>`` line.

        :return: a one-element list holding the parsed instance, or an
            empty list at end of file.
        """
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == '':
                assert instance_lines == []
                return []
            stripped = line.lstrip()

            # Start of a lexical element?
            if stripped.startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    # Seen on an earlier pass: it must not have changed.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if stripped.startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if stripped.startswith('</instance'):
                xml_block = '\n'.join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Convert a parsed ``<instance>`` element into a ``SensevalInstance``.

        :param instance: the parsed ``<instance>`` element
        :param lexelt: name of the lexical element the instance belongs to
        """
        tokenize = self._word_tokenizer.tokenize
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == 'answer':
                senses.append(child.attrib['senseid'])
            elif child.tag == 'context':
                # ElementTree gives None (not '') when there is no text
                # before the first child element.
                if child.text:
                    context += tokenize(child.text)
                for cword in child:
                    if cword.tag == 'compound':
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == 'head':
                        head_text = (cword.text or '').strip()
                        # Some sanity checks:
                        assert position is None, 'head specified twice'
                        assert head_text or len(cword) == 1
                        assert not (head_text and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if head_text:
                            context.append(head_text)
                        elif cword[0].tag == 'wf':
                            context.append((cword[0].text, cword[0].attrib['pos']))
                            if cword[0].tail:
                                context += tokenize(cword[0].tail)
                        else:
                            assert False, 'expected CDATA or wf in <head>'
                    elif cword.tag == 'wf':
                        context.append((cword.text, cword.attrib['pos']))
                    elif cword.tag == 's':
                        pass  # Sentence boundary marker.

                    else:
                        print('ACK', cword.tag)
                        assert False, 'expected CDATA or <wf> or <head>'
                    if cword.tail:
                        context += tokenize(cword.tail)
            else:
                assert False, 'unexpected tag %s' % child.tag
        return SensevalInstance(lexelt, position, context, senses)
|
||||
|
||||
|
||||
def _fixXML(text):
|
||||
"""
|
||||
Fix the various issues with Senseval pseudo-XML.
|
||||
"""
|
||||
# <~> or <^> => ~ or ^
|
||||
text = re.sub(r'<([~\^])>', r'\1', text)
|
||||
# fix lone &
|
||||
text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text)
|
||||
# fix """
|
||||
text = re.sub(r'"""', '\'"\'', text)
|
||||
# fix <s snum=dd> => <s snum="dd"/>
|
||||
text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
|
||||
# fix foreign word tag
|
||||
text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
|
||||
# remove <&I .>
|
||||
text = re.sub(r'<\&I[^>]*>', '', text)
|
||||
# fix <{word}>
|
||||
text = re.sub(r'<{([^}]+)}>', r'\1', text)
|
||||
# remove <@>, <p>, </p>
|
||||
text = re.sub(r'<(@|/?p)>', r'', text)
|
||||
# remove <&M .> and <&T .> and <&Ms .>
|
||||
text = re.sub(r'<&\w+ \.>', r'', text)
|
||||
# remove <!DOCTYPE... > lines
|
||||
text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
|
||||
# remove <[hi]> and <[/p]> etc
|
||||
text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
|
||||
# take the thing out of the brackets: <…>
|
||||
text = re.sub(r'<(\&\w+;)>', r'\1', text)
|
||||
# and remove the & for those patterns that aren't regular XML
|
||||
text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
|
||||
# fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
|
||||
text = re.sub(
|
||||
r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
|
||||
)
|
||||
text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
|
||||
return text
|
||||
@@ -0,0 +1,139 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Natural Language Toolkit: SentiWordNet
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Christopher Potts <cgpotts@stanford.edu>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
An NLTK interface for SentiWordNet
|
||||
|
||||
SentiWordNet is a lexical resource for opinion mining.
|
||||
SentiWordNet assigns to each synset of WordNet three
|
||||
sentiment scores: positivity, negativity, and objectivity.
|
||||
|
||||
For details about SentiWordNet see:
|
||||
http://sentiwordnet.isti.cnr.it/
|
||||
|
||||
>>> from nltk.corpus import sentiwordnet as swn
|
||||
>>> print(swn.senti_synset('breakdown.n.03'))
|
||||
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
|
||||
>>> list(swn.senti_synsets('slow'))
|
||||
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
|
||||
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
|
||||
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
|
||||
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
|
||||
SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
|
||||
SentiSynset('behind.r.03')]
|
||||
>>> happy = swn.senti_synsets('happy', 'a')
|
||||
>>> happy0 = list(happy)[0]
|
||||
>>> happy0.pos_score()
|
||||
0.875
|
||||
>>> happy0.neg_score()
|
||||
0.0
|
||||
>>> happy0.obj_score()
|
||||
0.125
|
||||
"""
|
||||
|
||||
import re
|
||||
from nltk.compat import python_2_unicode_compatible
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class SentiWordNetCorpusReader(CorpusReader):
    """Corpus reader that maps WordNet synsets to SentiWordNet scores."""

    def __init__(self, root, fileids, encoding='utf-8'):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.

        :raise ValueError: unless exactly one file is specified.
        """
        super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError('Exactly one file must be specified')
        # Maps (pos, offset) to (positive score, negative score).
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """
        Load the scores from the source file into ``self._db``.

        Lines starting with ``#`` are skipped; every other line must hold
        six tab-separated fields.

        :raise ValueError: if a line does not have exactly six fields.
        """
        stream = self.open(self._fileids[0])
        try:
            lines = stream.read().splitlines()
        finally:
            stream.close()
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError:
                # Only a wrong field count can fail the unpacking above.
                raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line))
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Look up a single synset.

        :param vals: either a ``(pos, offset)`` pair, or a single synset
            name that is passed to ``wordnet.synset()``.
        :return: the matching ``SentiSynset``, or None if the named synset
            has no scores.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            if pos == 's':
                pos = 'a'
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            # Satellite adjectives are looked up under 'a'.
            if pos == 's':
                pos = 'a'
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """
        :return: the ``SentiSynset`` of every WordNet synset of ``string``
            (optionally restricted to ``pos``) that has scores, as the
            result of ``filter()``.
        """
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Generate a ``SentiSynset`` for every entry in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class SentiSynset(object):
    """
    A synset paired with its positive and negative sentiment scores; the
    objectivity score is derived as ``1.0 - (positive + negative)``.
    """

    def __init__(self, pos_score, neg_score, synset):
        self.synset = synset
        self._pos_score = pos_score
        self._neg_score = neg_score
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        pieces = [
            "<",
            self.synset.name() + ": ",
            "PosScore=%s " % self._pos_score,
            "NegScore=%s" % self._neg_score,
            ">",
        ]
        return "".join(pieces)

    def __repr__(self):
        return "Senti" + repr(self.synset)
|
||||
@@ -0,0 +1,76 @@
|
||||
# Natural Language Toolkit: Sinica Treebank Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Sinica Treebank Corpus Sample
|
||||
|
||||
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
|
||||
|
||||
10,000 parsed sentences, drawn from the Academia Sinica Balanced
|
||||
Corpus of Modern Chinese. Parse tree notation is based on
|
||||
Information-based Case Grammar. Tagset documentation is available
|
||||
at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
|
||||
|
||||
Language and Knowledge Processing Group, Institute of Information
|
||||
Science, Academia Sinica
|
||||
|
||||
The data is distributed with the Natural Language Toolkit under the terms of
|
||||
the Creative Commons Attribution-NonCommercial-ShareAlike License
|
||||
[http://creativecommons.org/licenses/by-nc-sa/2.5/].
|
||||
|
||||
References:
|
||||
|
||||
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
|
||||
The Construction of Sinica Treebank. Computational Linguistics and
|
||||
Chinese Language Processing, 4, pp 87-104.
|
||||
|
||||
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
|
||||
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
|
||||
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
|
||||
Chinese Language Processing Workshop, Association for Computational
|
||||
Linguistics.
|
||||
|
||||
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
|
||||
Extraction, Proceedings of IJCNLP-04, pp560-565.
|
||||
"""
|
||||
|
||||
from nltk.tree import sinica_parse
|
||||
from nltk.tag import map_tag
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
# A leading "#..." identifier (up to and including the first whitespace
# character) at the start of a sentence line.
IDENTIFIER = re.compile(r'^#\S+\s')
# A trailing "#..." appendix that directly follows a closing parenthesis.
APPENDIX = re.compile(r'(?<=\))#.*$')
# A ":tag:word" leaf; captures the tag and the word.
TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
# A ":tag:word" leaf; captures only the word.
WORD = re.compile(r':[^:()|]+:([^:()|]+)')
|
||||
|
||||
|
||||
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        """Read one line and strip its identifier prefix and appendix suffix."""
        line = stream.readline()
        for pattern in (IDENTIFIER, APPENDIX):
            line = pattern.sub('', line)
        return [line]

    def _parse(self, sent):
        """Parse one cleaned sentence line with ``sinica_parse``."""
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        """
        Return the ``(word, tag)`` pairs of a sentence line, mapping the tags
        to ``tagset`` when one is given that differs from the reader's own.
        """
        tag_word_pairs = TAGWORD.findall(sent)
        if tagset and tagset != self._tagset:
            return [
                (w, map_tag(self._tagset, tagset, t)) for (t, w) in tag_word_pairs
            ]
        return [(w, t) for (t, w) in tag_word_pairs]

    def _word(self, sent):
        """Return the words of a sentence line, without their tags."""
        return WORD.findall(sent)
|
||||
@@ -0,0 +1,67 @@
|
||||
# Natural Language Toolkit: String Category Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Read tuples from a corpus consisting of categorized strings.
|
||||
For example, from the question classification corpus:
|
||||
|
||||
NUM:dist How far is it from Denver to Aspen ?
|
||||
LOC:city What county is Modesto , California in ?
|
||||
HUM:desc Who was Galileo ?
|
||||
DESC:def What is an atom ?
|
||||
NUM:date When did Hawaii become a state ?
|
||||
"""
|
||||
|
||||
# based on PPAttachmentCorpusReader
|
||||
from six import string_types
|
||||
|
||||
from nltk import compat
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
# [xx] Should the order of the tuple be reversed -- in most other places
|
||||
# in nltk, we use the form (data, tag) -- e.g., tagged words and
|
||||
# labeled texts for classifiers.
|
||||
class StringCategoryCorpusReader(CorpusReader):
    """
    Reader for files that hold one categorized string per line, with the
    category and the string separated by a delimiter.
    """

    def __init__(self, root, fileids, delimiter=' ', encoding='utf8'):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """
        :return: the given file(s) as a sequence of tuples, one per
            non-empty line, each line split once on the delimiter.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream as soon as it has been read, so file handles
            # are not left to the garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _read_tuple_block(self, stream):
        """Read one line; return it as a one-tuple list, or [] if it is blank."""
        line = stream.readline().strip()
        if line:
            # Split only once, so the string part may contain the delimiter.
            return [tuple(line.split(self._delimiter, 1))]
        else:
            return []
|
||||
@@ -0,0 +1,129 @@
|
||||
# Natural Language Toolkit: Switchboard Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
|
||||
from nltk.tag import str2tuple, map_tag
|
||||
from nltk import compat
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        """
        :param words: the words (or tagged-word tuples) of the utterance
        :param speaker: the speaker identifier
        :param id: the utterance id; converted with ``int()``
        """
        list.__init__(self, words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        if not self:
            text = ''
        elif isinstance(self[0], tuple):
            # Tagged turn: render every item as word/tag.
            text = ' '.join('%s/%s' % w for w in self)
        else:
            text = ' '.join(self)
        return '<%s.%s: %r>' % (self.speaker, self.id, text)
|
||||
|
||||
|
||||
class SwitchboardCorpusReader(CorpusReader):
    """
    Corpus reader for the switchboard data stored in a single file
    named ``tagged``.

    Each non-blank line of that file is parsed as one utterance of the
    form ``<speaker>.<id>: word/tag word/tag ...``.  The reader exposes
    the data as words, turns (one ``SwitchboardTurn`` per utterance) or
    discourses (lists of turns), each with or without tags.
    """

    _FILES = ['tagged']
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    # Matches one utterance line: ``<speaker>.<id>: <text>``.  Written as
    # a raw string so the regex escapes are not interpreted as (invalid)
    # string-literal escapes; the compiled pattern is unchanged.
    _UTTERANCE_RE = re.compile(r'(\w+)\.(\d+)\:\s*(.*)')
    # Separator between a word and its tag.
    _SEP = '/'

    def __init__(self, root, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param tagset: The name of the tagset used by the corpus file;
            used as the source tagset when tags are mapped.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """:return: a corpus view over all words, without tags."""
        return StreamBackedCorpusView(self.abspath('tagged'), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """
        :param tagset: If given and different from the reader's own
            tagset, tags are converted with ``map_tag``.
        :return: a corpus view over all ``(word, tag)`` tuples.
        """

        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath('tagged'), tagged_words_block_reader)

    def turns(self):
        """:return: a corpus view over all turns, without tags."""
        return StreamBackedCorpusView(self.abspath('tagged'), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """
        :param tagset: If given and different from the reader's own
            tagset, tags are converted with ``map_tag``.
        :return: a corpus view over all turns, with tags.
        """

        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath('tagged'), tagged_turns_block_reader)

    def discourses(self):
        """:return: a corpus view over all discourses, without tags."""
        return StreamBackedCorpusView(
            self.abspath('tagged'), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=False):
        """
        :param tagset: If truthy and different from the reader's own
            tagset, tags are converted with ``map_tag``.  The default
            ``False`` is falsy, so it behaves like ``None`` does in the
            other ``tagged_*`` methods (no mapping).
        :return: a corpus view over all discourses, with tags.
        """

        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath('tagged'), tagged_discourses_block_reader
        )

    def _read_discourse(self, stream, include_tag, tagset=None):
        """Parse every non-blank line of the next block into a turn."""
        return [
            self._parse_utterance(u, include_tag=include_tag, tagset=tagset)
            for b in read_blankline_block(stream)
            for u in b.split('\n')
            if u.strip()
        ]

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [self._read_discourse(stream, include_tag=False)]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [self._read_discourse(stream, include_tag=True, tagset=tagset)]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the turns in one pass; sum(turns, []) copies the growing
        # list for every turn and is quadratic in the discourse size.
        discourse = self._discourses_block_reader(stream)[0]
        return [word for turn in discourse for word in turn]

    def _tagged_words_block_reader(self, stream, tagset=None):
        discourse = self._tagged_discourses_block_reader(stream, tagset)[0]
        return [word for turn in discourse for word in turn]

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """
        Parse one utterance line into a ``SwitchboardTurn``.

        :param utterance: A line of the form ``<speaker>.<id>: <text>``.
        :param include_tag: If true, keep ``(word, tag)`` tuples;
            otherwise keep only the words.
        :param tagset: If truthy and different from the reader's own
            tagset, tags are converted with ``map_tag``.
        :raise ValueError: If the line does not match the expected form.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError('Bad utterance %r' % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)
|
||||
394
venv/lib/python3.7/site-packages/nltk/corpus/reader/tagged.py
Normal file
394
venv/lib/python3.7/site-packages/nltk/corpus/reader/tagged.py
Normal file
@@ -0,0 +1,394 @@
|
||||
# Natural Language Toolkit: Tagged Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Jacob Perkins <japerk@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora whose documents contain part-of-speech-tagged words.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.tag import str2tuple, map_tag
|
||||
from nltk.tokenize import *
|
||||
|
||||
from nltk.corpus.reader.api import *
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.timit import read_timit_block
|
||||
|
||||
|
||||
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using ``nltk.tag.str2tuple``.  By default, ``'/'`` is used as the
    separator.  I.e., words should have the form::

       word1/tag1 word2/tag2 word3/tag3 ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """

    def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        para_block_reader=read_blankline_block,
        encoding='utf8',
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*[.]txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: The separator between a word and its tag.
        :param word_tokenizer: Tokenizer used to split a sentence
            string into ``word<sep>tag`` tokens.
        :param sent_tokenizer: Tokenizer used to split a paragraph
            string into sentence strings.
        :param para_block_reader: Function that reads paragraph strings
            from a stream.
        :param encoding: The encoding passed on to ``CorpusReader``.
        :param tagset: The name of the tagset used by this corpus; used
            as the source tagset when tags are mapped.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream explicitly instead of leaving it to the
            # garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _make_tag_mapper(self, tagset):
        """
        Return a function mapping this corpus's tags to ``tagset``, or
        ``None`` when no mapping is needed (``tagset`` is falsy or is
        the corpus's own tagset).
        """
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def _make_views(
        self, fileids, tagged, group_by_sent, group_by_para, tag_mapping_function=None
    ):
        """
        Build one ``TaggedCorpusView`` per file and concatenate them.

        :param tagged: Whether tokens keep their tags.
        :param group_by_sent: Whether tokens are grouped into sentences.
        :param group_by_para: Whether sentences are grouped into paragraphs.
        :param tag_mapping_function: Optional function applied to each tag.
        """
        return concat(
            [
                TaggedCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sep,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._para_block_reader,
                    tag_mapping_function,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._make_views(fileids, False, False, False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._make_views(fileids, False, True, False)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._make_views(fileids, False, True, True)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._make_views(
            fileids, True, False, False, self._make_tag_mapper(tagset)
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        """
        return self._make_views(
            fileids, True, True, False, self._make_tag_mapper(tagset)
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._make_views(
            fileids, True, True, True, self._make_tag_mapper(tagset)
        )
|
||||
|
||||
|
||||
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A tagged corpus reader whose documents are additionally organised
    into categories.  Every accessor accepts either ``fileids`` or
    ``categories`` (but not both) to select documents.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  The keyword-argument dictionary
        is handed to the ``CategorizedCorpusReader`` constructor first
        (for ``cat_pattern``, ``cat_map`` and ``cat_file``); the
        positional and keyword arguments are then passed to the
        ``TaggedCorpusReader`` constructor.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a ``fileids``/``categories`` pair into a fileid selection.

        :raise ValueError: If both arguments are given.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    def raw(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return TaggedCorpusReader.raw(self, selection)

    def words(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return TaggedCorpusReader.words(self, selection)

    def sents(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return TaggedCorpusReader.sents(self, selection)

    def paras(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return TaggedCorpusReader.paras(self, selection)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selection = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_words(self, selection, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selection = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_sents(self, selection, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selection = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_paras(self, selection, tagset)
|
||||
|
||||
|
||||
class TaggedCorpusView(StreamBackedCorpusView):
    """
    Corpus view over a tagged document.  Flags given at construction
    time decide whether tokens keep their part-of-speech tags and
    whether they are grouped by sentence and/or by paragraph.
    ``TaggedCorpusView`` objects are typically created by
    ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        # Output-shape flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Parsing machinery.
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        # The base initializer runs last, once all attributes are set.
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def _parse_sentence(self, sent_str):
        """Turn one sentence string into a list of tokens."""
        tokens = [
            str2tuple(raw_token, self._sep)
            for raw_token in self._word_tokenizer.tokenize(sent_str)
        ]
        # Map tags first, then (optionally) drop them.
        if self._tag_mapping_function:
            tokens = [
                (word, self._tag_mapping_function(tag)) for (word, tag) in tokens
            ]
        if not self._tagged:
            tokens = [word for (word, tag) in tokens]
        return tokens

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        result = []
        for para_str in self._para_block_reader(stream):
            paragraph = []
            # Either keep sentences as units or splice their tokens in.
            add_sentence = (
                paragraph.append if self._group_by_sent else paragraph.extend
            )
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                add_sentence(self._parse_sentence(sent_str))
            # Same choice one level up, for paragraphs.
            if self._group_by_para:
                result.append(paragraph)
            else:
                result.extend(paragraph)
        return result
|
||||
|
||||
|
||||
# needs to implement simplified tags
|
||||
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param encoding: The encoding passed on to ``TaggedCorpusReader``.
        :param tagset: The name of the tagset used by this corpus.
        """
        per_line_tokenizer = LineTokenizer()
        sentence_tokenizer = RegexpTokenizer('.*\n')
        TaggedCorpusReader.__init__(
            self,
            root,
            fileids,
            sep='_',
            word_tokenizer=per_line_tokenizer,
            sent_tokenizer=sentence_tokenizer,
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        # Delegates to read_regexp_block; presumably the two patterns are
        # the block start and end -- confirm against read_regexp_block.
        start_pattern = r'.*'
        end_pattern = r'.*_\.'
        return read_regexp_block(stream, start_pattern, end_pattern)
|
||||
|
||||
|
||||
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    """

    def __init__(self, *args, **kwargs):
        """
        Accepts the same arguments as ``TaggedCorpusReader``, except
        that ``para_block_reader`` is always ``read_timit_block``.
        """
        TaggedCorpusReader.__init__(
            self, *args, para_block_reader=read_timit_block, **kwargs
        )

    def paras(self):
        """Not supported by this reader; always raises."""
        raise NotImplementedError('use sents() instead')

    def tagged_paras(self):
        """Not supported by this reader; always raises."""
        raise NotImplementedError('use tagged_sents() instead')
|
||||
499
venv/lib/python3.7/site-packages/nltk/corpus/reader/timit.py
Normal file
499
venv/lib/python3.7/site-packages/nltk/corpus/reader/timit.py
Normal file
@@ -0,0 +1,499 @@
|
||||
# Natural Language Toolkit: TIMIT Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2007 NLTK Project
|
||||
# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Jacob Perkins <japerk@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
# [xx] this docstring is out-of-date:
|
||||
"""
|
||||
Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
|
||||
|
||||
This corpus contains a selected portion of the TIMIT corpus.
|
||||
|
||||
- 16 speakers from 8 dialect regions
|
||||
- 1 male and 1 female from each dialect region
|
||||
- total 130 sentences (10 sentences per speaker. Note that some
|
||||
sentences are shared among other speakers, especially sa1 and sa2
|
||||
are spoken by all speakers.)
|
||||
- total 160 recordings of sentences (10 recordings per speaker)
|
||||
- audio format: NIST Sphere, single channel, 16kHz sampling,
|
||||
16 bit sample, PCM encoding
|
||||
|
||||
|
||||
Module contents
|
||||
===============
|
||||
|
||||
The timit corpus reader provides 4 functions and 4 data items.
|
||||
|
||||
- utterances
|
||||
|
||||
List of utterances in the corpus. There are total 160 utterances,
|
||||
each of which corresponds to a unique utterance of a speaker.
|
||||
Here's an example of an utterance identifier in the list::
|
||||
|
||||
dr1-fvmh0/sx206
|
||||
- _---- _---
|
||||
| | | | |
|
||||
| | | | |
|
||||
| | | | `--- sentence number
|
||||
| | | `----- sentence type (a:all, i:shared, x:exclusive)
|
||||
| | `--------- speaker ID
|
||||
| `------------ sex (m:male, f:female)
|
||||
`-------------- dialect region (1..8)
|
||||
|
||||
- speakers
|
||||
|
||||
List of speaker IDs. An example of speaker ID::
|
||||
|
||||
dr1-fvmh0
|
||||
|
||||
Note that if you split an item ID on the slash and take the first element of
|
||||
the result, you will get a speaker ID.
|
||||
|
||||
>>> itemid = 'dr1-fvmh0/sx206'
|
||||
>>> spkrid , sentid = itemid.split('/')
|
||||
>>> spkrid
|
||||
'dr1-fvmh0'
|
||||
|
||||
The second element of the result is a sentence ID.
|
||||
|
||||
- dictionary()
|
||||
|
||||
Phonetic dictionary of words contained in this corpus. This is a Python
|
||||
dictionary from words to phoneme lists.
|
||||
|
||||
- spkrinfo()
|
||||
|
||||
Speaker information table. It's a Python dictionary from speaker IDs to
|
||||
records of 10 fields. Speaker IDs are the same as the ones in timit.speakers.
|
||||
Each record is a dictionary from field names to values, and the fields are
|
||||
as follows::
|
||||
|
||||
id speaker ID as defined in the original TIMIT speaker info table
|
||||
sex speaker gender (M:male, F:female)
|
||||
dr speaker dialect region (1:new england, 2:northern,
|
||||
3:north midland, 4:south midland, 5:southern, 6:new york city,
|
||||
7:western, 8:army brat (moved around))
|
||||
use corpus type (TRN:training, TST:test)
|
||||
in this sample corpus only TRN is available
|
||||
recdate recording date
|
||||
birthdate speaker birth date
|
||||
ht speaker height
|
||||
race speaker race (WHT:white, BLK:black, AMR:american indian,
|
||||
SPN:spanish-american, ORN:oriental,???:unknown)
|
||||
edu speaker education level (HS:high school, AS:associate degree,
|
||||
BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
|
||||
PHD:doctorate degree (PhD,JD,MD), ??:unknown)
|
||||
comments comments by the recorder
|
||||
|
||||
The 4 functions are as follows.
|
||||
|
||||
- tokenized(sentences=items, offset=False)
|
||||
|
||||
Given a list of items, returns an iterator of a list of word lists,
|
||||
each of which corresponds to an item (sentence). If offset is set to True,
|
||||
each element of the word list is a tuple of word(string), start offset and
|
||||
end offset, where offset is represented as a number of 16kHz samples.
|
||||
|
||||
- phonetic(sentences=items, offset=False)
|
||||
|
||||
Given a list of items, returns an iterator of a list of phoneme lists,
|
||||
each of which corresponds to an item (sentence). If offset is set to True,
|
||||
each element of the phoneme list is a tuple of word(string), start offset
|
||||
and end offset, where offset is represented as a number of 16kHz samples.
|
||||
|
||||
- audiodata(item, start=0, end=None)
|
||||
|
||||
Given an item, returns a chunk of audio samples formatted into a string.
|
||||
When the function is called, if start and end are omitted, the entire
|
||||
samples of the recording will be returned. If only end is omitted,
|
||||
samples from the start offset to the end of the recording will be returned.
|
||||
|
||||
- play(data)
|
||||
|
||||
Play the given audio samples. The audio samples can be obtained from the
|
||||
timit.audiodata function.
|
||||
|
||||
"""
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk import compat
|
||||
from nltk.tree import Tree
|
||||
from nltk.internals import import_from_stdlib
|
||||
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class TimitCorpusReader(CorpusReader):
    """
    Reader for the TIMIT corpus (or any other corpus with the same
    file layout and use of file formats).  The corpus root directory
    should contain the following files:

      - timitdic.txt: dictionary of standard transcriptions
      - spkrinfo.txt: table of speaker information

    In addition, the root directory should contain one subdirectory
    for each speaker, containing four files for each utterance:

      - <utterance-id>.txt: text content of utterances
      - <utterance-id>.wrd: tokenized text content of utterances
      - <utterance-id>.phn: phonetic transcription of utterances
      - <utterance-id>.wav: utterance sound file
    """

    # A regexp matching fileids that are used by this corpus reader.
    _FILE_RE = r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + r'timitdic\.txt|spkrinfo\.txt'
    # A regexp matching the ``.txt`` fileid of every utterance.
    _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'

    def __init__(self, root, encoding='utf8'):
        """
        Construct a new TIMIT corpus reader in the given directory.
        :param root: The root directory for this corpus.
        :param encoding: The encoding of the text files.  When given as
            a string, it is applied to every file except ``.wav`` files.
        """
        # Ensure that wave files don't get treated as unicode data:
        if isinstance(encoding, string_types):
            # Raw string: '\.' is a regex escape, not a string escape.
            encoding = [(r'.*\.wav', None), ('.*', encoding)]

        CorpusReader.__init__(
            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
        )

        # A list of the utterance identifiers for all utterances in
        # this corpus (the ``.txt`` fileids with the extension removed).
        self._utterances = [
            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
        ]

        self._speakerinfo = None  # filled lazily by spkrinfo()
        self._root = root
        self.speakers = sorted(set(u.split('/')[0] for u in self._utterances))

    def fileids(self, filetype=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus.

        :param filetype: If specified, then ``filetype`` indicates that
            only the files that have the given type should be
            returned.  Accepted values are: ``txt``, ``wrd``, ``phn``,
            ``wav``, or ``metadata``,
        :raise ValueError: If ``filetype`` is any other value.
        """
        if filetype is None:
            return CorpusReader.fileids(self)
        elif filetype in ('txt', 'wrd', 'phn', 'wav'):
            return ['%s.%s' % (u, filetype) for u in self._utterances]
        elif filetype == 'metadata':
            return ['timitdic.txt', 'spkrinfo.txt']
        else:
            raise ValueError('Bad value for filetype: %r' % filetype)

    def utteranceids(
        self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
    ):
        """
        :return: A list of the utterance identifiers for all
            utterances in this corpus, or for the given speaker, dialect
            region, gender, sentence type, or sentence number, if
            specified.

        Each filter may be a single string or a collection of strings.
        The filters index into fixed positions of the identifier, so
        they assume identifiers shaped like ``dr1-fvmh0/sx206``.
        """
        if isinstance(dialect, string_types):
            dialect = [dialect]
        if isinstance(sex, string_types):
            sex = [sex]
        if isinstance(spkrid, string_types):
            spkrid = [spkrid]
        if isinstance(sent_type, string_types):
            sent_type = [sent_type]
        if isinstance(sentid, string_types):
            sentid = [sentid]

        utterances = self._utterances[:]
        if dialect is not None:
            utterances = [u for u in utterances if u[2] in dialect]
        if sex is not None:
            utterances = [u for u in utterances if u[4] in sex]
        if spkrid is not None:
            utterances = [u for u in utterances if u[:9] in spkrid]
        if sent_type is not None:
            utterances = [u for u in utterances if u[11] in sent_type]
        if sentid is not None:
            # Compare against ``sentid``; this previously tested ``spkrid``,
            # which raised TypeError when spkrid was None and otherwise
            # filtered on the wrong values.
            utterances = [u for u in utterances if u[10:] in sentid]
        return utterances

    def transcription_dict(self):
        """
        :return: A dictionary giving the 'standard' transcription for
            each word.
        :raise ValueError: If a line of ``timitdic.txt`` is neither
            blank, a ``;`` comment, nor of the form ``word /phones/``.
        """
        _transcriptions = {}
        for line in self.open('timitdic.txt'):
            if not line.strip() or line[0] == ';':
                continue
            m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
            if not m:
                raise ValueError('Bad line: %r' % line)
            _transcriptions[m.group(1)] = m.group(2).split()
        return _transcriptions

    def spkrid(self, utterance):
        """:return: the speaker part of an utterance id."""
        return utterance.split('/')[0]

    def sentid(self, utterance):
        """:return: the sentence part of an utterance id."""
        return utterance.split('/')[1]

    def utterance(self, spkrid, sentid):
        """:return: the utterance id built from a speaker and sentence id."""
        return '%s/%s' % (spkrid, sentid)

    def spkrutteranceids(self, speaker):
        """
        :return: A list of all utterances associated with a given
            speaker.
        """
        return [
            utterance
            for utterance in self._utterances
            if utterance.startswith(speaker + '/')
        ]

    def spkrinfo(self, speaker):
        """
        :param speaker: A speaker id, or an utterance id (in which case
            its speaker part is used).
        :return: The ``SpeakerInfo`` record for the given speaker, as
            parsed from ``spkrinfo.txt``.  The table is parsed on the
            first call and cached.
        :raise KeyError: If the speaker is not in the table.
        """
        if speaker in self._utterances:
            speaker = self.spkrid(speaker)

        if self._speakerinfo is None:
            self._speakerinfo = {}
            for line in self.open('spkrinfo.txt'):
                if not line.strip() or line[0] == ';':
                    continue
                # At most 10 fields; the last one keeps its whitespace.
                rec = line.strip().split(None, 9)
                key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower())
                self._speakerinfo[key] = SpeakerInfo(*rec)

        return self._speakerinfo[speaker]

    def _timed_items(self, utterances, extension):
        """
        Read ``(label, start, end)`` triples from the files with the
        given extension; each non-blank line is ``start end label``.
        """
        items = []
        for fileid in self._utterance_fileids(utterances, extension):
            for line in self.open(fileid):
                if line.strip():
                    fields = line.split()
                    items.append((fields[2], int(fields[0]), int(fields[1])))
        return items

    def phones(self, utterances=None):
        """:return: the last field of every non-blank ``.phn`` line."""
        return [
            line.split()[-1]
            for fileid in self._utterance_fileids(utterances, '.phn')
            for line in self.open(fileid)
            if line.strip()
        ]

    def phone_times(self, utterances=None):
        """
        offset is represented as a number of 16kHz samples!

        :return: a list of ``(phone, start, end)`` tuples.
        """
        return self._timed_items(utterances, '.phn')

    def words(self, utterances=None):
        """:return: the last field of every non-blank ``.wrd`` line."""
        return [
            line.split()[-1]
            for fileid in self._utterance_fileids(utterances, '.wrd')
            for line in self.open(fileid)
            if line.strip()
        ]

    def word_times(self, utterances=None):
        """:return: a list of ``(word, start, end)`` tuples."""
        return self._timed_items(utterances, '.wrd')

    def sents(self, utterances=None):
        """:return: one list of words per utterance ``.wrd`` file."""
        return [
            [line.split()[-1] for line in self.open(fileid) if line.strip()]
            for fileid in self._utterance_fileids(utterances, '.wrd')
        ]

    def sent_times(self, utterances=None):
        """
        :return: a list of ``(sentence, start, end)`` tuples read from
            the ``.txt`` files; the sentence is everything after the
            first two fields of the line.
        """
        return [
            (
                line.split(None, 2)[-1].strip(),
                int(line.split()[0]),
                int(line.split()[1]),
            )
            for fileid in self._utterance_fileids(utterances, '.txt')
            for line in self.open(fileid)
            if line.strip()
        ]

    def phone_trees(self, utterances=None):
        """
        :return: one ``Tree('S', ...)`` per sentence.  Each word becomes
            a subtree holding its phones; phones that fall outside any
            word are attached directly to the sentence node.
        """
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, string_types):
            utterances = [utterances]

        trees = []
        for utterance in utterances:
            word_times = self.word_times(utterance)
            phone_times = self.phone_times(utterance)
            sent_times = self.sent_times(utterance)

            while sent_times:
                (sent, sent_start, sent_end) = sent_times.pop(0)
                trees.append(Tree('S', []))
                # Phones that end before the first word starts.
                while (
                    word_times and phone_times and phone_times[0][2] <= word_times[0][1]
                ):
                    trees[-1].append(phone_times.pop(0)[0])
                # Words of this sentence, each with the phones it spans.
                while word_times and word_times[0][2] <= sent_end:
                    (word, word_start, word_end) = word_times.pop(0)
                    trees[-1].append(Tree(word, []))
                    while phone_times and phone_times[0][2] <= word_end:
                        trees[-1][-1].append(phone_times.pop(0)[0])
                # Phones after the last word but inside the sentence.
                while phone_times and phone_times[0][2] <= sent_end:
                    trees[-1].append(phone_times.pop(0)[0])
        return trees

    # [xx] NOTE: This is currently broken -- we're assuming that the
    # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
    # fileids.
    def wav(self, utterance, start=0, end=None):
        """
        :return: the frames ``start`` to ``end`` of the utterance's
            ``.wav`` file, re-encoded as a complete wave file.
        """
        # nltk.chunk conflicts with the stdlib module 'chunk'
        wave = import_from_stdlib('wave')

        w = wave.open(self.open(utterance + '.wav'), 'rb')

        if end is None:
            end = w.getnframes()

        # Skip past frames before start, then read the frames we want
        w.readframes(start)
        frames = w.readframes(end - start)

        # Open a new temporary file -- the wave module requires
        # an actual file, and won't work w/ stringio. :(
        tf = tempfile.TemporaryFile()
        out = wave.open(tf, 'w')

        # Write the parameters & data to the new file.
        out.setparams(w.getparams())
        out.writeframes(frames)
        out.close()

        # Read the data back from the file, and return it.  The
        # file will automatically be deleted when we return.
        tf.seek(0)
        return tf.read()

    def audiodata(self, utterance, start=0, end=None):
        """
        :return: the raw contents of the utterance's ``.wav`` file from
            sample ``start`` up to sample ``end`` (or to the end of the
            file), with the header skipped.
        """
        assert end is None or end > start
        # NOTE(review): assumes a 44-byte header and 2 bytes per sample;
        # the comment on wav() says the files are NIST SPHERE -- confirm.
        headersize = 44
        if end is None:
            data = self.open(utterance + '.wav').read()
        else:
            data = self.open(utterance + '.wav').read(headersize + end * 2)
        return data[headersize + start * 2 :]

    def _utterance_fileids(self, utterances, extension):
        """
        :return: the fileids for the given utterance id(s) (all
            utterances if None), each with ``extension`` appended.
        """
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, string_types):
            utterances = [utterances]
        return ['%s%s' % (u, extension) for u in utterances]

    def play(self, utterance, start=0, end=None):
        """
        Play the given audio sample.

        :param utterance: The utterance id of the sample to play
        :param start: The first sample to play.
        :param end: The sample to stop at, or None for the whole
            recording.
        """
        # Method 1: os audio dev.
        try:
            import ossaudiodev

            try:
                dsp = ossaudiodev.open('w')
                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
                dsp.channels(1)
                dsp.speed(16000)
                dsp.write(self.audiodata(utterance, start, end))
                dsp.close()
            except IOError as e:
                print(
                    (
                        "can't acquire the audio device; please "
                        "activate your audio device."
                    ),
                    file=sys.stderr,
                )
                print("system error message:", str(e), file=sys.stderr)
            return
        except ImportError:
            pass

        # Method 2: pygame
        try:
            import pygame.mixer

            # wav() returns bytes, so wrap it in a BytesIO.  The old
            # ``StringIO`` module does not exist on Python 3, where its
            # ImportError silently disabled this playback method.
            from io import BytesIO

            pygame.mixer.init(16000)
            f = BytesIO(self.wav(utterance, start, end))
            pygame.mixer.Sound(f).play()
            while pygame.mixer.get_busy():
                time.sleep(0.01)
            return
        except ImportError:
            pass

        # Method 3: complain. :)
        print(
            ("you must install pygame or ossaudiodev " "for audio playback."),
            file=sys.stderr,
        )
|
||||
|
||||
|
||||
@compat.python_2_unicode_compatible
class SpeakerInfo(object):
    """
    Plain record holding the fields of one speaker: ``id``, ``sex``,
    ``dr``, ``use``, ``recdate``, ``birthdate``, ``ht``, ``race``,
    ``edu`` and the optional ``comments``.
    """

    def __init__(
        self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
    ):
        # Identity.
        self.id = id
        self.sex = sex
        self.dr = dr
        self.use = use
        # Dates.
        self.recdate = recdate
        self.birthdate = birthdate
        # Remaining fields.
        self.ht = ht
        self.race = race
        self.edu = edu
        self.comments = comments

    def __repr__(self):
        attribs = 'id sex dr use recdate birthdate ht race edu comments'
        pairs = []
        for attr in attribs.split():
            pairs.append('%s=%r' % (attr, getattr(self, attr)))
        return 'SpeakerInfo(%s)' % ', '.join(pairs)
|
||||
|
||||
|
||||
def read_timit_block(stream):
    """
    Read one timit tagged sentence from ``stream``.

    Each line starts with a sentence number followed by a single space;
    the number is discarded and the remainder of the line (including its
    line terminator) is returned.

    :param stream: An object with a ``readline()`` method.
    :return: A one-element list holding the sentence text, or an empty
        list when the stream is exhausted.
    """
    raw_line = stream.readline()
    if raw_line:
        # Unpacking deliberately requires a space separator to be present.
        _number, sentence = raw_line.split(' ', 1)
        return [sentence]
    return []
|
||||
@@ -0,0 +1,83 @@
|
||||
# Natural Language Toolkit: Toolbox Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Greg Aumann <greg_aumann@sil.org>
|
||||
# Stuart Robinson <Stuart.Robinson@mpi.nl>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Module for reading, writing and manipulating
|
||||
Toolbox databases and settings fileids.
|
||||
"""
|
||||
|
||||
from nltk.toolbox import ToolboxData
|
||||
from nltk.corpus.reader.util import *
|
||||
from nltk.corpus.reader.api import *
|
||||
|
||||
|
||||
class ToolboxCorpusReader(CorpusReader):
    """
    Corpus reader whose files are parsed with ``ToolboxData``.
    """

    def xml(self, fileids, key=None):
        """
        Parse the given files with ``ToolboxData.parse``.

        :param fileids: The file(s) to read.
        :param key: Forwarded to ``ToolboxData.parse``.
        :return: The concatenation of the per-file parse results.
        """
        return concat(
            [
                ToolboxData(path, enc).parse(key=key)
                for (path, enc) in self.abspaths(fileids, True)
            ]
        )

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding='utf8',
        errors='strict',
        unicode_fields=None,
    ):
        """
        Return the fields of the given files.

        All keyword arguments are forwarded positionally to
        ``ToolboxData.fields``.

        :param fileids: The file(s) to read.
        :return: The concatenation of the per-file field lists; the items
            are consumed as ``(marker, contents)`` pairs by ``entries``
            and ``words``.
        """
        return concat(
            [
                list(
                    ToolboxData(fileid, enc).fields(
                        strip, unwrap, encoding, errors, unicode_fields
                    )
                )
                for (fileid, enc) in self.abspaths(fileids, include_encoding=True)
            ]
        )

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """
        Group the fields of the given files into entries.

        A field whose marker equals ``key`` starts a new entry; every
        following field is attached to that entry until the next ``key``
        field. Fields that occur before the first ``key`` field are
        dropped.

        :param fileids: The file(s) to read.
        :param kwargs: ``key`` selects the marker that starts an entry
            (default ``'lx'``); everything else is forwarded to
            ``fields``.
        :return: A list of ``(contents, [(marker, contents), ...])`` tuples.
        """
        key = kwargs.pop('key', 'lx')  # the default key in MDF
        entries = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                entries.append((contents, []))
            elif entries:
                entries[-1][-1].append((marker, contents))
            # else: no entry has been started yet, so the field is skipped.
        return entries

    def words(self, fileids, key='lx'):
        """
        Return the contents of every field whose marker equals ``key``.

        :param fileids: The file(s) to read.
        :param key: The marker to select.
        """
        return [contents for marker, contents in self.fields(fileids) if marker == key]

    def raw(self, fileids):
        """
        Return the unprocessed contents of the given files, concatenated.

        :param fileids: A single fileid (a string), a list of fileids, or
            None to read every file in ``self._fileids``.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                # Close each stream explicitly instead of leaving it to
                # the garbage collector.
                stream.close()
        return concat(contents)
|
||||
|
||||
|
||||
def demo():
    """Placeholder demonstration hook; it currently performs no work."""
    return None


# Run the demonstration only when this module is executed as a script.
if __name__ == '__main__':
    demo()
|
||||
153
venv/lib/python3.7/site-packages/nltk/corpus/reader/twitter.py
Normal file
153
venv/lib/python3.7/site-packages/nltk/corpus/reader/twitter.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# Natural Language Toolkit: Twitter Corpus Reader
|
||||
#
|
||||
# Copyright (C) 2001-2019 NLTK Project
|
||||
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||||
# URL: <http://nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
A reader for corpora that consist of Tweets. It is assumed that the Tweets
|
||||
have been serialised into line-delimited JSON.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from six import string_types
|
||||
|
||||
from nltk.tokenize import TweetTokenizer
|
||||
|
||||
from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
|
||||
from nltk.corpus.reader.api import CorpusReader
|
||||
|
||||
|
||||
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader('/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

       import json
       for tweet in reader.docs():
           print(json.dumps(tweet, indent=1, sort_keys=True))

    """

    CorpusView = StreamBackedCorpusView
    """
    The corpus view class used by this reader.
    """

    def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'
    ):
        """

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.

        :param encoding: Forwarded to ``CorpusReader.__init__``.

        :raises ValueError: If a corpus file that is not inside a zip
            archive has size zero.

        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        # Files addressed through a zip archive are not checked.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        Tweets without a ``'text'`` key are skipped.

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono['text']
                if isinstance(text, bytes):
                    # NOTE(review): ``self.encoding`` is not assigned in this
                    # class; it looks like it resolves to a CorpusReader
                    # method rather than an encoding name -- confirm before
                    # relying on this branch.
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets,
            each as a list of words, screen names, hashtags, URLs and
            punctuation symbols.

        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def raw(self, fileids=None):
        """
        Return the corpora in their raw form.

        :param fileids: A single fileid (a string), a list of fileids, or
            None to read every file in ``self._fileids``.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            try:
                contents.append(stream.read())
            finally:
                # Close each stream explicitly instead of leaving it to
                # the garbage collector.
                stream.close()
        return concat(contents)

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.

        Reads at most 10 lines per call and stops early at end of stream.

        :return: The deserialised objects read by this call.
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                return tweets
            tweet = json.loads(line)
            tweets.append(tweet)
        return tweets
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user