Initial commit

This commit is contained in:
Senad Uka
2019-10-20 13:16:49 +02:00
commit 233066caf4
2099 changed files with 360824 additions and 0 deletions

View File

@@ -0,0 +1,493 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# TODO this docstring isn't up-to-date!
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus files in a variety of formats. These
functions can be used to read both the corpus files that are
distributed in the NLTK corpus package, and corpus files that are part
of external corpora.
Available Corpora
=================
Please see http://www.nltk.org/nltk_data/ for a complete list.
Install corpora using nltk.download().
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a filename, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()))
The, Fulton, County, Grand, Jury, said, ...
"""
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
# Each corpus below is bound to a LazyCorpusLoader constructed from the corpus
# name, a reader class, and the extra positional/keyword arguments shown.
# NOTE(review): presumably the extras are forwarded to the reader class when
# the corpus is first accessed -- confirm against nltk.corpus.util.
#
# abc: the encoding is a list of (regexp, encoding) pairs rather than a single
# name, so the 'science' and 'rural' files are decoded differently.
abc = LazyCorpusLoader(
'abc',
PlaintextCorpusReader,
r'(?!\.).*\.txt',
encoding=[('science', 'latin_1'), ('rural', 'utf8')],
)
alpino = LazyCorpusLoader('alpino', AlpinoCorpusReader, tagset='alpino')
brown = LazyCorpusLoader(
'brown',
CategorizedTaggedCorpusReader,
r'c[a-z]\d\d',
cat_file='cats.txt',
tagset='brown',
encoding="ascii",
)
# cess_cat and cess_esp use identical reader settings; only the corpus name
# differs.
cess_cat = LazyCorpusLoader(
'cess_cat',
BracketParseCorpusReader,
r'(?!\.).*\.tbf',
tagset='unknown',
encoding='ISO-8859-15',
)
cess_esp = LazyCorpusLoader(
'cess_esp',
BracketParseCorpusReader,
r'(?!\.).*\.tbf',
tagset='unknown',
encoding='ISO-8859-15',
)
# cmudict: the fileids are given as an explicit list, not as a pattern.
cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader('comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
comparative_sentences = LazyCorpusLoader(
'comparative_sentences',
ComparativeSentencesCorpusReader,
r'labeledSentences\.txt',
encoding='latin-1',
)
# conll2000: explicit fileid list, followed by a positional tuple
# (presumably the chunk types to read -- TODO confirm in ConllChunkCorpusReader).
conll2000 = LazyCorpusLoader(
'conll2000',
ConllChunkCorpusReader,
['train.txt', 'test.txt'],
('NP', 'VP', 'PP'),
tagset='wsj',
encoding='ascii',
)
# The fileid patterns in this group are raw strings: ``\.`` is a regex escape,
# not a Python string escape, and a plain literal only works because Python
# passes unknown escapes through (while emitting a deprecation warning).
conll2002 = LazyCorpusLoader(
    'conll2002',
    ConllChunkCorpusReader,
    r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'),
    encoding='utf-8',
)
# conll2007: the encoding is chosen per file from (regexp, encoding) pairs.
conll2007 = LazyCorpusLoader(
    'conll2007',
    DependencyCorpusReader,
    r'.*\.(test|train).*',
    encoding=[('eus', 'ISO-8859-2'), ('esp', 'utf8')],
)
crubadan = LazyCorpusLoader('crubadan', CrubadanCorpusReader, r'.*\.txt')
dependency_treebank = LazyCorpusLoader(
    'dependency_treebank', DependencyCorpusReader, r'.*\.dp', encoding='ascii'
)
# floresta: the positional '#' follows the fileid pattern
# (presumably the comment character for BracketParseCorpusReader -- TODO confirm).
floresta = LazyCorpusLoader(
'floresta',
BracketParseCorpusReader,
r'(?!\.).*\.ptb',
'#',
tagset='unknown',
encoding='ISO-8859-15',
)
# framenet15 and framenet list the same index files; they differ only in the
# corpus directory name ('framenet_v15' vs 'framenet_v17').
framenet15 = LazyCorpusLoader(
'framenet_v15',
FramenetCorpusReader,
[
'frRelation.xml',
'frameIndex.xml',
'fulltextIndex.xml',
'luIndex.xml',
'semTypes.xml',
],
)
framenet = LazyCorpusLoader(
'framenet_v17',
FramenetCorpusReader,
[
'frRelation.xml',
'frameIndex.xml',
'fulltextIndex.xml',
'luIndex.xml',
'semTypes.xml',
],
)
gazetteers = LazyCorpusLoader(
'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt', encoding='ISO-8859-2'
)
# genesis: per-file encodings given as (regexp, encoding) pairs.  The final
# '.*' entry is a catch-all, so the order of the pairs matters.
genesis = LazyCorpusLoader(
'genesis',
PlaintextCorpusReader,
r'(?!\.).*\.txt',
encoding=[
('finnish|french|german', 'latin_1'),
('swedish', 'cp865'),
('.*', 'utf_8'),
],
)
gutenberg = LazyCorpusLoader(
'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
)
ieer = LazyCorpusLoader('ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1'
)
# [XX] This should probably just use TaggedCorpusReader:
indian = LazyCorpusLoader(
'indian', IndianCorpusReader, r'(?!\.).*\.pos', tagset='unknown', encoding='utf8'
)
jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
# knbc: the corpus name includes a subdirectory ('knbc/corpus1').
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp')
lin_thesaurus = LazyCorpusLoader('lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp')
mac_morpho = LazyCorpusLoader(
'mac_morpho',
MacMorphoCorpusReader,
r'(?!\.).*\.txt',
tagset='unknown',
encoding='latin-1',
)
# machado: the category comes from a pattern applied to the fileid
# (cat_pattern) instead of from a category file.
machado = LazyCorpusLoader(
'machado',
PortugueseCategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt',
cat_pattern=r'([a-z]*)/.*',
encoding='latin-1',
)
masc_tagged = LazyCorpusLoader(
'masc_tagged',
CategorizedTaggedCorpusReader,
r'(spoken|written)/.*\.txt',
cat_file='categories.txt',
tagset='wsj',
encoding="utf-8",
sep="_",
)
movie_reviews = LazyCorpusLoader(
'movie_reviews',
CategorizedPlaintextCorpusReader,
r'(?!\.).*\.txt',
cat_pattern=r'(neg|pos)/.*',
encoding='ascii',
)
# multext_east: the module-level name differs from the corpus directory
# name ('mte_teip5').
multext_east = LazyCorpusLoader(
'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8"
)
names = LazyCorpusLoader(
'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii'
)
nps_chat = LazyCorpusLoader(
'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj'
)
opinion_lexicon = LazyCorpusLoader(
'opinion_lexicon',
OpinionLexiconCorpusReader,
r'(\w+)\-words\.txt',
encoding='ISO-8859-2',
)
ppattach = LazyCorpusLoader(
'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset']
)
# product_reviews_1 and product_reviews_2 share every setting except the
# corpus name.
product_reviews_1 = LazyCorpusLoader(
'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
)
product_reviews_2 = LazyCorpusLoader(
'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8'
)
# pros_cons: the same pattern selects the files and, through its group,
# supplies the category (Cons or Pros).
pros_cons = LazyCorpusLoader(
'pros_cons',
ProsConsCorpusReader,
r'Integrated(Cons|Pros)\.txt',
cat_pattern=r'Integrated(Cons|Pros)\.txt',
encoding='ISO-8859-2',
)
# NOTE(review): the '.' before MRG in the ptb pattern is unescaped, so it
# matches any character, not only a literal dot -- confirm whether intended.
ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
'ptb',
CategorizedBracketParseCorpusReader,
r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG',
cat_file='allcats.txt',
tagset='wsj',
)
qc = LazyCorpusLoader(
'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'], encoding='ISO-8859-2'
)
reuters = LazyCorpusLoader(
'reuters',
CategorizedPlaintextCorpusReader,
'(training|test).*',
cat_file='cats.txt',
encoding='ISO-8859-2',
)
rte = LazyCorpusLoader('rte', RTECorpusReader, r'(?!\.).*\.xml')
senseval = LazyCorpusLoader('senseval', SensevalCorpusReader, r'(?!\.).*\.pos')
sentence_polarity = LazyCorpusLoader(
'sentence_polarity',
CategorizedSentencesCorpusReader,
r'rt-polarity\.(neg|pos)',
cat_pattern=r'rt-polarity\.(neg|pos)',
encoding='utf-8',
)
# NOTE(review): the sentiwordnet fileid is a plain string; if the reader hands
# it to CorpusReader it is expanded as a regexp, in which case the dots match
# any character -- confirm.
sentiwordnet = LazyCorpusLoader(
'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8'
)
shakespeare = LazyCorpusLoader('shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
sinica_treebank = LazyCorpusLoader(
'sinica_treebank',
SinicaTreebankCorpusReader,
['parsed'],
tagset='unknown',
encoding='utf-8',
)
state_union = LazyCorpusLoader(
'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='ISO-8859-2'
)
stopwords = LazyCorpusLoader(
'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8'
)
# subjectivity: categories come from an explicit fileid -> labels map.  The
# dots inside the pattern's group are unescaped, whereas the cat_map keys are
# literal file names.
subjectivity = LazyCorpusLoader(
'subjectivity',
CategorizedSentencesCorpusReader,
r'(quote.tok.gt9|plot.tok.gt9)\.5000',
cat_map={'quote.tok.gt9.5000': ['subj'], 'plot.tok.gt9.5000': ['obj']},
encoding='latin-1',
)
swadesh = LazyCorpusLoader(
'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8'
)
# swadesh110 and swadesh207 both read the 'panlex_swadesh' corpus and differ
# only in the subdirectory selected by the fileid pattern.
swadesh110 = LazyCorpusLoader(
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8'
)
swadesh207 = LazyCorpusLoader(
'panlex_swadesh', PanlexSwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8'
)
switchboard = LazyCorpusLoader('switchboard', SwitchboardCorpusReader, tagset='wsj')
timit = LazyCorpusLoader('timit', TimitCorpusReader)
# timit_tagged reads the same 'timit' corpus directory as ``timit`` with a
# different reader.  The pattern is a raw string so that ``\.`` is handed to
# the regex engine as written rather than relying on an invalid string escape.
timit_tagged = LazyCorpusLoader(
    'timit', TimitTaggedCorpusReader, r'.+\.tags', tagset='wsj', encoding='ascii'
)
# NOTE(review): the leading lookahead (?!.*(README|\.)) rejects any fileid
# that contains a '.', while the rest of the pattern requires '.dic' or
# '.txt'.  As written the pattern looks unable to match anything -- confirm
# how fileids are matched and whether this is intended.
toolbox = LazyCorpusLoader(
'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)'
)
# treebank, treebank_chunk and treebank_raw read three sibling
# subdirectories of 'treebank' (combined, tagged, raw).
treebank = LazyCorpusLoader(
'treebank/combined',
BracketParseCorpusReader,
r'wsj_.*\.mrg',
tagset='wsj',
encoding='ascii',
)
# treebank_chunk supplies its own sentence tokenizer and paragraph block
# reader.  The tokenizer pattern matches whitespace preceded by '/.' and not
# followed by a ']' that closes an already-open bracket; gaps=True makes
# those matches the separators.
treebank_chunk = LazyCorpusLoader(
'treebank/tagged',
ChunkedCorpusReader,
r'wsj_.*\.pos',
sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
para_block_reader=tagged_treebank_para_block_reader,
tagset='wsj',
encoding='ascii',
)
treebank_raw = LazyCorpusLoader(
'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2'
)
# Raw string: ``\.`` is a regex escape, not a valid Python string escape.
twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader, r'.*\.json')
# udhr passes no fileid pattern; udhr2 is read as plain UTF-8 text files.
udhr = LazyCorpusLoader('udhr', UdhrCorpusReader)
udhr2 = LazyCorpusLoader('udhr2', PlaintextCorpusReader, r'.*\.txt', encoding='utf8')
# universal_treebanks: columntypes names ten columns; only the second
# ('words') and fifth ('pos') are not 'ignore'.
universal_treebanks = LazyCorpusLoader(
'universal_treebanks_v20',
ConllCorpusReader,
r'.*\.conll',
columntypes=(
'ignore',
'words',
'ignore',
'ignore',
'pos',
'ignore',
'ignore',
'ignore',
'ignore',
'ignore',
),
)
verbnet = LazyCorpusLoader('verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2'
)
# wordnet: a second lazy loader, for the 'omw' corpus, is passed as the
# argument that follows the reader class.
wordnet = LazyCorpusLoader(
'wordnet',
WordNetCorpusReader,
LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8'),
)
# All regex patterns in this group are raw strings: ``\.`` is a regex escape,
# not a valid Python string escape.
wordnet_ic = LazyCorpusLoader('wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii'
)
# The four loaders below take an already-declared loader (``treebank`` or
# ``ptb``) as their last argument, so they have to come after it.  Each also
# takes a callable that rewrites a file name: the treebank variants strip a
# leading ``wsj/NN/`` prefix, the ptb variants upper-case the whole name.
propbank = LazyCorpusLoader(
    'propbank',
    PropbankCorpusReader,
    'prop.txt',
    r'frames/.*\.xml',
    'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank,
)  # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0',
    NombankCorpusReader,
    'nombank.1.0',
    r'frames/.*\.xml',
    'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank,
)  # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
    'propbank',
    PropbankCorpusReader,
    'prop.txt',
    r'frames/.*\.xml',
    'verbs.txt',
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
    'nombank.1.0',
    NombankCorpusReader,
    'nombank.1.0',
    r'frames/.*\.xml',
    'nombank.1.0.words',
    lambda filename: filename.upper(),
    ptb,
)  # Must be defined *after* ptb corpus.
# semcor receives the ``wordnet`` loader as its last positional argument.
semcor = LazyCorpusLoader(
'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml', wordnet
) # Must be defined *after* wordnet corpus.
nonbreaking_prefixes = LazyCorpusLoader(
'nonbreaking_prefixes',
NonbreakingPrefixesCorpusReader,
r'(?!README|\.).*',
encoding='utf8',
)
# perluniprops is the only loader in this module that passes nltk_data_subdir
# (presumably the nltk_data subdirectory to search instead of the default --
# TODO confirm in LazyCorpusLoader).
perluniprops = LazyCorpusLoader(
'perluniprops',
UnicharsCorpusReader,
r'(?!README|\.).*',
nltk_data_subdir='misc',
encoding='utf8',
)
# mwa_ppdb = LazyCorpusLoader(
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
# See https://github.com/nltk/nltk/issues/1579
# and https://github.com/nltk/nltk/issues/1716
#
# pl196x = LazyCorpusLoader(
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
#
# ipipan = LazyCorpusLoader(
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
#
# nkjp = LazyCorpusLoader(
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
# panlex_lite = LazyCorpusLoader(
# 'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
# 'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
# hebrew_treebank = LazyCorpusLoader(
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
def demo():
    """
    Call ``demo()`` on a fixed selection of the corpora declared in this
    module, one after another in the order listed.
    """
    # This is out-of-date:
    # (chat80 and ycoe are deliberately absent; their calls were disabled.)
    selected_corpora = (
        abc,
        brown,
        cmudict,
        conll2000,
        conll2002,
        genesis,
        gutenberg,
        ieer,
        inaugural,
        indian,
        names,
        ppattach,
        senseval,
        shakespeare,
        sinica_treebank,
        state_union,
        stopwords,
        timit,
        toolbox,
        treebank,
        udhr,
        webtext,
        words,
    )
    for corpus in selected_corpora:
        corpus.demo()
# Script entry point.  The demo() call is commented out, so running this
# module directly does nothing.
if __name__ == '__main__':
    # demo()
    pass
# ** this is for nose **
# unload all corpus after tests
def teardown_module(module=None):
    """
    Test-teardown hook: call ``_unload()`` on every attribute of
    ``nltk.corpus`` that is a ``CorpusReader`` instance and has an
    ``_unload`` attribute.

    :param module: unused; accepted so the hook can be called with the
        module under test.
    """
    import nltk.corpus

    corpus_package = nltk.corpus
    for attr_name in dir(corpus_package):
        candidate = getattr(corpus_package, attr_name, None)
        if not isinstance(candidate, CorpusReader):
            continue
        if hasattr(candidate, '_unload'):
            candidate._unload()

View File

@@ -0,0 +1,55 @@
# Natural Language Toolkit: Europarl Corpus Readers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import re
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
# Create a new corpus reader instance for each European language
def _europarl_loader(language, suffix):
    r"""
    Build the lazy loader for one Europarl language.

    :param language: directory name under ``europarl_raw/``.
    :param suffix: file extension of that language's files; fileids are
        selected with the pattern ``ep-.*\.<suffix>``.
    :return: a ``LazyCorpusLoader`` using ``EuroparlCorpusReader`` and
        UTF-8 encoding.
    """
    return LazyCorpusLoader(
        'europarl_raw/' + language,
        EuroparlCorpusReader,
        r'ep-.*\.' + suffix,
        encoding='utf-8',
    )


danish = _europarl_loader('danish', 'da')
dutch = _europarl_loader('dutch', 'nl')
english = _europarl_loader('english', 'en')
finnish = _europarl_loader('finnish', 'fi')
french = _europarl_loader('french', 'fr')
german = _europarl_loader('german', 'de')
greek = _europarl_loader('greek', 'el')
italian = _europarl_loader('italian', 'it')
portuguese = _europarl_loader('portuguese', 'pt')
spanish = _europarl_loader('spanish', 'es')
swedish = _europarl_loader('swedish', 'sv')

View File

@@ -0,0 +1,183 @@
# Natural Language Toolkit: Corpus Readers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
NLTK corpus readers. The modules in this package provide functions
that can be used to read corpus fileids in a variety of formats. These
functions can be used to read both the corpus fileids that are
distributed in the NLTK corpus package, and corpus fileids that are part
of external corpora.
Corpus Reader Functions
=======================
Each corpus module defines one or more "corpus reader functions",
which can be used to read documents from that corpus. These functions
take an argument, ``item``, which is used to indicate which document
should be read from the corpus:
- If ``item`` is one of the unique identifiers listed in the corpus
module's ``items`` variable, then the corresponding document will
be loaded from the NLTK corpus package.
- If ``item`` is a fileid, then that file will be read.
Additionally, corpus reader functions can be given lists of item
names; in which case, they will return a concatenation of the
corresponding documents.
Corpus reader functions are named based on the type of information
they return. Some common examples, and their return types, are:
- words(): list of str
- sents(): list of (list of str)
- paras(): list of (list of (list of str))
- tagged_words(): list of (str,str) tuple
- tagged_sents(): list of (list of (str,str))
- tagged_paras(): list of (list of (list of (str,str)))
- chunked_sents(): list of (Tree w/ (str,str) leaves)
- parsed_sents(): list of (Tree with str leaves)
- parsed_paras(): list of (list of (Tree with str leaves))
- xml(): A single xml ElementTree
- raw(): unprocessed corpus contents
For example, to read a list of the words in the Brown Corpus, use
``nltk.corpus.brown.words()``:
>>> from nltk.corpus import brown
>>> print(", ".join(brown.words()))
The, Fulton, County, Grand, Jury, said, ...
"""
from nltk.corpus.reader.plaintext import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.tagged import *
from nltk.corpus.reader.cmudict import *
from nltk.corpus.reader.conll import *
from nltk.corpus.reader.chunked import *
from nltk.corpus.reader.wordlist import *
from nltk.corpus.reader.xmldocs import *
from nltk.corpus.reader.ppattach import *
from nltk.corpus.reader.senseval import *
from nltk.corpus.reader.ieer import *
from nltk.corpus.reader.sinica_treebank import *
from nltk.corpus.reader.bracket_parse import *
from nltk.corpus.reader.indian import *
from nltk.corpus.reader.toolbox import *
from nltk.corpus.reader.timit import *
from nltk.corpus.reader.ycoe import *
from nltk.corpus.reader.rte import *
from nltk.corpus.reader.string_category import *
from nltk.corpus.reader.propbank import *
from nltk.corpus.reader.verbnet import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.nps_chat import *
from nltk.corpus.reader.wordnet import *
from nltk.corpus.reader.switchboard import *
from nltk.corpus.reader.dependency import *
from nltk.corpus.reader.nombank import *
from nltk.corpus.reader.ipipan import *
from nltk.corpus.reader.pl196x import *
from nltk.corpus.reader.knbc import *
from nltk.corpus.reader.chasen import *
from nltk.corpus.reader.childes import *
from nltk.corpus.reader.aligned import *
from nltk.corpus.reader.lin import *
from nltk.corpus.reader.semcor import *
from nltk.corpus.reader.framenet import *
from nltk.corpus.reader.udhr import *
from nltk.corpus.reader.bnc import *
from nltk.corpus.reader.sentiwordnet import *
from nltk.corpus.reader.twitter import *
from nltk.corpus.reader.nkjp import *
from nltk.corpus.reader.crubadan import *
from nltk.corpus.reader.mte import *
from nltk.corpus.reader.reviews import *
from nltk.corpus.reader.opinion_lexicon import *
from nltk.corpus.reader.pros_cons import *
from nltk.corpus.reader.categorized_sents import *
from nltk.corpus.reader.comparative_sents import *
from nltk.corpus.reader.panlex_lite import *
from nltk.corpus.reader.panlex_swadesh import *
# Make sure that nltk.corpus.reader.bracket_parse gives the module, not
# the function bracket_parse() defined in nltk.tree:
from nltk.corpus.reader import bracket_parse
# Names exported by ``from nltk.corpus.reader import *``.  Each name appears
# exactly once (a second 'BNCCorpusReader' entry used to follow
# 'UdhrCorpusReader').
__all__ = [
    'CorpusReader',
    'CategorizedCorpusReader',
    'PlaintextCorpusReader',
    'find_corpus_fileids',
    'TaggedCorpusReader',
    'CMUDictCorpusReader',
    'ConllChunkCorpusReader',
    'WordListCorpusReader',
    'PPAttachmentCorpusReader',
    'SensevalCorpusReader',
    'IEERCorpusReader',
    'ChunkedCorpusReader',
    'SinicaTreebankCorpusReader',
    'BracketParseCorpusReader',
    'IndianCorpusReader',
    'ToolboxCorpusReader',
    'TimitCorpusReader',
    'YCOECorpusReader',
    'MacMorphoCorpusReader',
    'SyntaxCorpusReader',
    'AlpinoCorpusReader',
    'RTECorpusReader',
    'StringCategoryCorpusReader',
    'EuroparlCorpusReader',
    'CategorizedBracketParseCorpusReader',
    'CategorizedTaggedCorpusReader',
    'CategorizedPlaintextCorpusReader',
    'PortugueseCategorizedPlaintextCorpusReader',
    'tagged_treebank_para_block_reader',
    'PropbankCorpusReader',
    'VerbnetCorpusReader',
    'BNCCorpusReader',
    'ConllCorpusReader',
    'XMLCorpusReader',
    'NPSChatCorpusReader',
    'SwadeshCorpusReader',
    'WordNetCorpusReader',
    'WordNetICCorpusReader',
    'SwitchboardCorpusReader',
    'DependencyCorpusReader',
    'NombankCorpusReader',
    'IPIPANCorpusReader',
    'Pl196xCorpusReader',
    'TEICorpusView',
    'KNBCorpusReader',
    'ChasenCorpusReader',
    'CHILDESCorpusReader',
    'AlignedCorpusReader',
    'TimitTaggedCorpusReader',
    'LinThesaurusCorpusReader',
    'SemcorCorpusReader',
    'FramenetCorpusReader',
    'UdhrCorpusReader',
    'SentiWordNetCorpusReader',
    'SentiSynset',
    'TwitterCorpusReader',
    'NKJPCorpusReader',
    'CrubadanCorpusReader',
    'MTECorpusReader',
    'ReviewsCorpusReader',
    'OpinionLexiconCorpusReader',
    'ProsConsCorpusReader',
    'CategorizedSentencesCorpusReader',
    'ComparativeSentencesCorpusReader',
    'PanLexLiteCorpusReader',
    'NonbreakingPrefixesCorpusReader',
    'UnicharsCorpusReader',
    'MWAPPDBCorpusReader',
    'PanlexSwadeshCorpusReader',
]

View File

@@ -0,0 +1,168 @@
# Natural Language Toolkit: Aligned Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# Author: Steven Bird <stevenbird1@gmail.com>
# For license information, see LICENSE.TXT
from six import string_types
from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
from nltk.translate import AlignedSent, Alignment
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import (
StreamBackedCorpusView,
concat,
read_alignedsent_block,
)
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*[.]txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Stored on the reader as ``_sep``; not used by any
            method of this class.
        :param word_tokenizer: Tokenizer used to split a line into tokens.
        :param sent_tokenizer: Tokenizer used to split a block of text
            into its lines.
        :param alignedsent_block_reader: Function that reads one block of
            text from a stream.
        :param encoding: Encoding passed on to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly instead of leaving it to the
            # garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per selected file and
        concatenate them.

        :param fileids: file selection, as accepted by ``abspaths()``.
        :param aligned: passed to the view as its ``aligned`` flag.
        :param group_by_sent: passed to the view as its ``group_by_sent``
            flag.
        """
        return concat(
            [
                AlignedSentCorpusView(
                    path,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (path, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, False, False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, False, True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, True, True)
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    Corpus view over a file of aligned sentences.  Instances are normally
    built by ``AlignedCorpusReader`` rather than directly by nltk users.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        """
        Store the view options and tokenizers, then initialise the
        underlying stream-backed view for ``corpus_file``.
        """
        self._alignedsent_block_reader = alignedsent_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._word_tokenizer = word_tokenizer
        self._group_by_sent = group_by_sent
        self._aligned = aligned
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read one block from ``stream`` and return it in the form selected
        by the ``aligned`` / ``group_by_sent`` flags.

        Every line of the block is word-tokenized.  With ``aligned`` set,
        the third tokenized line is re-joined and parsed as an
        ``Alignment`` and a one-element list holding an ``AlignedSent`` is
        returned.  Otherwise only the first tokenized line is used: wrapped
        in a list when ``group_by_sent`` is set, returned bare when not.
        """
        tokenized_lines = []
        for chunk in self._alignedsent_block_reader(stream):
            for line in self._sent_tokenizer.tokenize(chunk):
                tokenized_lines.append(self._word_tokenizer.tokenize(line))

        if self._aligned:
            # kludge; we shouldn't have tokenized the alignment string
            alignment_text = " ".join(tokenized_lines[2])
            tokenized_lines[2] = Alignment.fromstring(alignment_text)
            return [AlignedSent(*tokenized_lines)]
        if self._group_by_sent:
            return [tokenized_lines[0]]
        return tokenized_lines[0]

View File

@@ -0,0 +1,484 @@
# Natural Language Toolkit: API for Corpus Readers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
API for corpus readers.
"""
from __future__ import unicode_literals
import os
import re
from collections import defaultdict
from itertools import chain
from six import string_types
from nltk import compat
from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer
from nltk.corpus.reader.util import *
@compat.python_2_unicode_compatible
class CorpusReader(object):
"""
A base class for "corpus reader" classes, each of which can be
used to read a specific corpus format. Each individual corpus
reader instance is used to read a specific corpus, consisting of
one or more files under a common root directory. Each file is
identified by its ``file identifier``, which is the relative path
to the file from the root directory.
A separate subclass is defined for each corpus format. These
subclasses define one or more methods that provide 'views' on the
corpus contents, such as ``words()`` (for a list of words) and
``parsed_sents()`` (for a list of parsed sentences). Called with
no arguments, these methods will return the contents of the entire
corpus. For most corpora, these methods define one or more
selection arguments, such as ``fileids`` or ``categories``, which can
be used to select which portion of the corpus should be returned.
"""
def __init__(self, root, fileids, encoding='utf8', tagset=None):
"""
:type root: PathPointer or str
:param root: A path pointer identifying the root directory for
this corpus. If a string is specified, then it will be
converted to a ``PathPointer`` automatically.
:param fileids: A list of the files that make up this corpus.
This list can either be specified explicitly, as a list of
strings; or implicitly, as a regular expression over file
paths. The absolute path for each file will be constructed
by joining the reader's root to each file name.
:param encoding: The default unicode encoding for the files
that make up the corpus. The value of ``encoding`` can be any
of the following:
- A string: ``encoding`` is the encoding name for all files.
- A dictionary: ``encoding[file_id]`` is the encoding
name for the file whose identifier is ``file_id``. If
``file_id`` is not in ``encoding``, then the file
contents will be processed using non-unicode byte strings.
- A list: ``encoding`` should be a list of ``(regexp, encoding)``
tuples. The encoding for a file whose identifier is ``file_id``
will be the ``encoding`` value for the first tuple whose
``regexp`` matches the ``file_id``. If no tuple's ``regexp``
matches the ``file_id``, the file contents will be processed
using non-unicode byte strings.
- None: the file contents of all files will be
processed using non-unicode byte strings.
:param tagset: The name of the tagset used by this corpus, to be used
for normalizing or converting the POS tags returned by the
tagged_...() methods.
"""
# Convert the root to a path pointer, if necessary.
if isinstance(root, string_types) and not isinstance(root, PathPointer):
m = re.match('(.*\.zip)/?(.*)$|', root)
zipfile, zipentry = m.groups()
if zipfile:
root = ZipFilePathPointer(zipfile, zipentry)
else:
root = FileSystemPathPointer(root)
elif not isinstance(root, PathPointer):
raise TypeError('CorpusReader: expected a string or a PathPointer')
# If `fileids` is a regexp, then expand it.
if isinstance(fileids, string_types):
fileids = find_corpus_fileids(root, fileids)
self._fileids = fileids
"""A list of the relative paths for the fileids that make up
this corpus."""
self._root = root
"""The root directory for this corpus."""
# If encoding was specified as a list of regexps, then convert
# it to a dictionary.
if isinstance(encoding, list):
encoding_dict = {}
for fileid in self._fileids:
for x in encoding:
(regexp, enc) = x
if re.match(regexp, fileid):
encoding_dict[fileid] = enc
break
encoding = encoding_dict
self._encoding = encoding
"""The default unicode encoding for the fileids that make up
this corpus. If ``encoding`` is None, then the file
contents are processed using byte strings."""
self._tagset = tagset
def __repr__(self):
if isinstance(self._root, ZipFilePathPointer):
path = '%s/%s' % (self._root.zipfile.filename, self._root.entry)
else:
path = '%s' % self._root.path
return '<%s in %r>' % (self.__class__.__name__, path)
def ensure_loaded(self):
"""
Load this corpus (if it has not already been loaded). This is
used by LazyCorpusLoader as a simple method that can be used to
make sure a corpus is loaded -- e.g., in case a user wants to
do help(some_corpus).
"""
pass # no need to actually do anything.
def readme(self):
"""
Return the contents of the corpus README file, if it exists.
"""
return self.open("README").read()
def license(self):
"""
Return the contents of the corpus LICENSE file, if it exists.
"""
return self.open("LICENSE").read()
def citation(self):
"""
Return the contents of the corpus citation.bib file, if it exists.
"""
return self.open("citation.bib").read()
def fileids(self):
"""
Return a list of file identifiers for the fileids that make up
this corpus.
"""
return self._fileids
def abspath(self, fileid):
"""
Return the absolute path for the given file.
:type fileid: str
:param fileid: The file identifier for the file whose path
should be returned.
:rtype: PathPointer
"""
return self._root.join(fileid)
def abspaths(self, fileids=None, include_encoding=False, include_fileid=False):
"""
Return a list of the absolute paths for all fileids in this corpus;
or for the given list of fileids, if specified.
:type fileids: None or str or list
:param fileids: Specifies the set of fileids for which paths should
be returned. Can be None, for all fileids; a list of
file identifiers, for a specified set of fileids; or a single
file identifier, for a single file. Note that the return
value is always a list of paths, even if ``fileids`` is a
single file identifier.
:param include_encoding: If true, then return a list of
``(path_pointer, encoding)`` tuples.
:rtype: list(PathPointer)
"""
if fileids is None:
fileids = self._fileids
elif isinstance(fileids, string_types):
fileids = [fileids]
paths = [self._root.join(f) for f in fileids]
if include_encoding and include_fileid:
return list(zip(paths, [self.encoding(f) for f in fileids], fileids))
elif include_fileid:
return list(zip(paths, fileids))
elif include_encoding:
return list(zip(paths, [self.encoding(f) for f in fileids]))
else:
return paths
def open(self, file):
"""
Return an open stream that can be used to read the given file.
If the file's encoding is not None, then the stream will
automatically decode the file's contents into unicode.
:param file: The file identifier of the file to read.
"""
encoding = self.encoding(file)
stream = self._root.join(file).open(encoding)
return stream
def encoding(self, file):
    """
    Look up the unicode encoding for the given corpus file, if known.

    ``self._encoding`` is either a per-file dict (missing files yield
    None) or a single value that applies to every file.  If the
    encoding is unknown, or if the given file should be processed
    using byte strings (str), the result is None.
    """
    configured = self._encoding
    if not isinstance(configured, dict):
        return configured
    return configured.get(file)
def _get_root(self):
return self._root
root = property(
_get_root,
doc="""
The directory where this corpus is stored.
:type: PathPointer""",
)
######################################################################
# { Corpora containing categorized items
######################################################################
class CategorizedCorpusReader(object):
    """
    A mixin class used to aid in the implementation of corpus readers
    for categorized corpora.  This class defines the method
    ``categories()``, which returns a list of the categories for the
    corpus or for a specified set of fileids; and overrides ``fileids()``
    to take a ``categories`` argument, restricting the set of fileids to
    be returned.

    Subclasses are expected to:

      - Call ``__init__()`` to set up the mapping.

      - Override all view methods to accept a ``categories`` parameter,
        which can be used *instead* of the ``fileids`` parameter, to
        select which fileids should be included in the returned view.
    """

    def __init__(self, kwargs):
        """
        Initialize this mapping based on keyword arguments, as
        follows:

          - cat_pattern: A regular expression pattern used to find the
            category for each file identifier.  The pattern will be
            applied to each file identifier, and the first matching
            group will be used as the category label for that file.

          - cat_map: A dictionary, mapping from file identifiers to
            category labels.

          - cat_file: The name of a file that contains the mapping
            from file identifiers to categories.  The argument
            ``cat_delimiter`` can be used to specify a delimiter.

        The corresponding argument will be deleted from ``kwargs``.  If
        more than one argument is specified, an exception will be
        raised.

        :param kwargs: a dict of keyword arguments; it is modified in
            place (note that it is passed as a plain dict, not ``**kwargs``).
        :raise ValueError: if none, or more than one, of ``cat_pattern``,
            ``cat_map`` and ``cat_file`` is present.
        """
        self._f2c = None  #: file-to-category mapping
        self._c2f = None  #: category-to-file mapping

        self._pattern = None  #: regexp specifying the mapping
        self._map = None  #: dict specifying the mapping
        self._file = None  #: fileid of file containing the mapping
        self._delimiter = None  #: delimiter for ``self._file``

        if 'cat_pattern' in kwargs:
            self._pattern = kwargs.pop('cat_pattern')
        elif 'cat_map' in kwargs:
            self._map = kwargs.pop('cat_map')
        elif 'cat_file' in kwargs:
            self._file = kwargs.pop('cat_file')
            if 'cat_delimiter' in kwargs:
                self._delimiter = kwargs.pop('cat_delimiter')
        else:
            raise ValueError(
                'Expected keyword argument cat_pattern or cat_map or cat_file.'
            )

        # The chosen argument was removed above, so anything left over
        # means the caller supplied more than one mapping source.
        if 'cat_pattern' in kwargs or 'cat_map' in kwargs or 'cat_file' in kwargs:
            raise ValueError('Specify exactly one of: cat_pattern, cat_map, cat_file.')

    def _init(self):
        """
        Build the file-to-category and category-to-file mappings from
        whichever source was given to the constructor.  This is called
        lazily by ``categories()`` and ``fileids()``.

        :raise ValueError: if the category mapping file names a file
            identifier that is not in ``self.fileids()``.
        """
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            # Read everything up front so the stream can be closed
            # before the mapping is processed.
            stream = self.open(self._file)
            try:
                lines = stream.readlines()
            finally:
                stream.close()

            # Computed once: avoids one linear scan of the fileid list
            # per line of the mapping file.
            known_fileids = set(self.fileids())
            for line in lines:
                line = line.strip()
                file_id, categories = line.split(self._delimiter, 1)
                if file_id not in known_fileids:
                    raise ValueError(
                        'In category mapping file %s: %s '
                        'not found' % (self._file, file_id)
                    )
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)

    def _add(self, file_id, category):
        """Record that ``file_id`` belongs to ``category`` in both mappings."""
        self._f2c[file_id].add(category)
        self._c2f[category].add(file_id)

    def categories(self, fileids=None):
        """
        Return a list of the categories that are defined for this corpus,
        or for the file(s) if it is given.

        :param fileids: None (all categories), a single file identifier,
            or a list of file identifiers.  An empty list yields ``[]``.
        :return: a sorted list of category labels.
        """
        if self._f2c is None:
            self._init()
        if fileids is None:
            return sorted(self._c2f)
        if isinstance(fileids, string_types):
            fileids = [fileids]
        # ``set().union(...)`` accepts zero arguments (``set.union(*[])``
        # does not), and ``.get`` keeps unknown fileids from being
        # inserted into the defaultdict as a side effect of the lookup.
        return sorted(set().union(*[self._f2c.get(d, ()) for d in fileids]))

    def fileids(self, categories=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that make up the given category(s) if specified.

        :param categories: None (defer to the next ``fileids()`` in the
            MRO), a single category label, or a list of category labels.
            An empty list yields ``[]``.
        :raise ValueError: if a single category label is given and it is
            not a known category.
        """
        if categories is None:
            return super(CategorizedCorpusReader, self).fileids()
        elif isinstance(categories, string_types):
            if self._f2c is None:
                self._init()
            if categories in self._c2f:
                return sorted(self._c2f[categories])
            else:
                raise ValueError('Category %s not found' % categories)
        else:
            if self._f2c is None:
                self._init()
            # See categories(): tolerate an empty list, and do not let an
            # unknown label create a phantom category in ``_c2f``.
            return sorted(set().union(*[self._c2f.get(c, ()) for c in categories]))
######################################################################
# { Treebank readers
######################################################################
# [xx] is it worth it to factor this out?
class SyntaxCorpusReader(CorpusReader):
    """
    An abstract base class for reading corpora consisting of
    syntactically parsed text.  Subclasses should define:

      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block (and an optional tagset name) and
        returns a list of list of tagged words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.
    """

    def _parse(self, s):
        raise NotImplementedError()

    def _word(self, s):
        raise NotImplementedError()

    def _tag(self, s, tagset=None):
        # ``tagset`` is part of the signature because
        # ``_read_tagged_sent_block`` always calls ``_tag(t, tagset)``.
        raise NotImplementedError()

    def _read_block(self, stream):
        raise NotImplementedError()

    def raw(self, fileids=None):
        """
        Return the concatenated raw contents of the given file(s).

        :param fileids: None for every file in the corpus, a single file
            identifier, or a list of file identifiers.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Release each handle as soon as its contents are read.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _block_views(self, fileids, block_reader):
        """Concatenate one ``StreamBackedCorpusView`` per file, each using ``block_reader``."""
        return concat(
            [
                StreamBackedCorpusView(fileid, block_reader, encoding=enc)
                for fileid, enc in self.abspaths(fileids, True)
            ]
        )

    def parsed_sents(self, fileids=None):
        """Return a view of the given file(s) built from ``_parse`` results."""
        return self._block_views(fileids, self._read_parsed_sent_block)

    def tagged_sents(self, fileids=None, tagset=None):
        """Return a view of the given file(s) built from ``_tag`` results."""

        def reader(stream):
            return self._read_tagged_sent_block(stream, tagset)

        return self._block_views(fileids, reader)

    def sents(self, fileids=None):
        """Return a view of the given file(s) built from ``_word`` results."""
        return self._block_views(fileids, self._read_sent_block)

    def tagged_words(self, fileids=None, tagset=None):
        """Return a view of the given file(s) with the ``_tag`` results flattened."""

        def reader(stream):
            return self._read_tagged_word_block(stream, tagset)

        return self._block_views(fileids, reader)

    def words(self, fileids=None):
        """Return a view of the given file(s) with the ``_word`` results flattened."""
        return self._block_views(fileids, self._read_word_block)

    # ------------------------------------------------------------
    # { Block Readers

    def _read_word_block(self, stream):
        return list(chain(*self._read_sent_block(stream)))

    def _read_tagged_word_block(self, stream, tagset=None):
        return list(chain(*self._read_tagged_sent_block(stream, tagset)))

    def _read_sent_block(self, stream):
        # filter(None, ...) drops falsy results, e.g. empty sentences.
        return list(filter(None, [self._word(t) for t in self._read_block(stream)]))

    def _read_tagged_sent_block(self, stream, tagset=None):
        return list(
            filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])
        )

    def _read_parsed_sent_block(self, stream):
        return list(filter(None, [self._parse(t) for t in self._read_block(stream)]))

    # } End of Block Readers
    # ------------------------------------------------------------

View File

@@ -0,0 +1,258 @@
# Natural Language Toolkit: BNC Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Corpus reader for the XML version of the British National Corpus."""
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree
class BNCCorpusReader(XMLCorpusReader):
    r"""Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    http://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    """

    def __init__(self, root, fileids, lazy=True):
        """
        :param root: passed through to ``XMLCorpusReader.__init__``.
        :param fileids: passed through to ``XMLCorpusReader.__init__``.
        :param lazy: If true, the view methods build ``BNCWordView``
            objects; otherwise they build plain lists via ``_words``.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(
            fileids, sent=False, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        if c5:
            tag = 'c5'
        else:
            tag = 'pos'
        return self._views(
            fileids, sent=False, tag=tag, strip_space=strip_space, stem=stem
        )

    def sents(self, fileids=None, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        return self._views(
            fileids, sent=True, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        """
        if c5:
            tag = 'c5'
        else:
            tag = 'pos'
        return self._views(fileids, True, tag, strip_space, stem)

    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False):
        """A helper function that instantiates BNCWordViews or the list of words/sentences."""
        # Lazy readers get a stream-backed view per file; eager readers
        # get a fully materialised list per file.
        build = self._words
        if self._lazy:
            build = BNCWordView
        per_file = [
            build(path, sent, tag, strip_space, stem)
            for path in self.abspaths(fileids)
        ]
        return concat(per_file)

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        collected = []

        document = ElementTree.parse(fileid).getroot()
        for sent_elt in document.findall('.//s'):
            tokens = []
            for word_elt in _all_xmlwords_in(sent_elt):
                token = word_elt.text or ""  # fixes issue 337?
                if strip_space or stem:
                    token = token.strip()
                if stem:
                    # Fall back to the surface form when there is no 'hw'.
                    token = word_elt.get('hw', token)
                if tag == 'c5':
                    token = (token, word_elt.get('c5'))
                elif tag == 'pos':
                    token = (token, word_elt.get('pos', word_elt.get('c5')))
                tokens.append(token)
            if not bracket_sent:
                collected.extend(tokens)
            else:
                collected.append(BNCSentence(sent_elt.attrib['n'], tokens))

        assert None not in collected
        return collected
def _all_xmlwords_in(elt, result=None):
if result is None:
result = []
for child in elt:
if child.tag in ('c', 'w'):
result.append(child)
else:
_all_xmlwords_in(child, result)
return result
class BNCSentence(list):
    """
    A list of words that additionally carries a ``num`` attribute
    holding the sentence identifier (the ``n`` attribute from the XML).
    """

    def __init__(self, num, items):
        """
        :param num: the sentence identifier to store as ``self.num``.
        :param items: the iterable used to populate the list.
        """
        self.num = num
        super(BNCSentence, self).__init__(items)
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.
    """

    tags_to_ignore = set(
        ['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align']
    )
    """These tags are ignored. For their description refer to the
    technical documentation, for example,
    http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html

    """

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        if sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(.*/)?(c|w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read in a tasty header.  The stream is closed even when the
        # header cannot be read, so a failed construction does not leak it.
        self._open()
        try:
            self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        finally:
            self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    @staticmethod
    def _stripped_text(elt):
        """Return ``elt.text`` stripped, treating a missing text node as ''."""
        return (elt.text or '').strip()

    def handle_header(self, elt, context):
        """
        Populate ``title``, ``author``, ``editor`` and ``resps`` from the
        ``titleStmt`` children of the header element ``elt``.  Attributes
        whose elements are absent are left untouched.
        """
        # Set up some metadata!
        text_of = self._stripped_text
        titles = elt.findall('titleStmt/title')
        if titles:
            self.title = '\n'.join(text_of(title) for title in titles)

        authors = elt.findall('titleStmt/author')
        if authors:
            self.author = '\n'.join(text_of(author) for author in authors)

        editors = elt.findall('titleStmt/editor')
        if editors:
            self.editor = '\n'.join(text_of(editor) for editor in editors)

        resps = elt.findall('titleStmt/respStmt')
        if resps:
            # One paragraph per respStmt, one line per child element.
            self.resps = '\n\n'.join(
                '\n'.join(text_of(resp_elt) for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        """Dispatch to sentence or word handling depending on the view mode."""
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """
        Convert a word/punctuation element into a token: a string, or a
        ``(string, tag)`` tuple when a tagset was requested.
        """
        word = elt.text
        if not word:
            word = ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            word = elt.get('hw', word)
        if self._tag == 'c5':
            word = (word, elt.get('c5'))
        elif self._tag == 'pos':
            word = (word, elt.get('pos', elt.get('c5')))
        return word

    def handle_sent(self, elt):
        """
        Convert a sentence element into a ``BNCSentence``.

        :raise ValueError: if a child element is neither a word, a known
            wrapper element, nor listed in ``tags_to_ignore``.
        """
        sent = []
        for child in elt:
            if child.tag in ('mw', 'hi', 'corr', 'trunc'):
                # Wrapper elements: every direct child is taken as a word.
                sent += [self.handle_word(w) for w in child]
            elif child.tag in ('w', 'c'):
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError('Unexpected element %s' % child.tag)
        return BNCSentence(elt.attrib['n'], sent)

View File

@@ -0,0 +1,271 @@
# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for corpora that consist of parenthesis-delineated parse trees.
"""
import sys
from nltk.tree import Tree
from nltk.tag import map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# we use [^\s()]+ instead of \S+? to avoid matching ()
# "(<digits> <token> <token>)" -- AlpinoCorpusReader._tag unpacks the three
# groups as (order, pos, word).
SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)')
# "(<token> <token>)" -- both tokens captured; _tag unpacks them as (pos, word).
TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
# "(<token> <token>)" -- only the second token is captured.
WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
# Two opening parens separated only by optional whitespace; used by
# _normalize to detect an extra pair of brackets around a parse.
EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse trees,
    like those found in the "combined" section of the Penn Treebank,
    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".

    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks='unindented_paren',
        encoding='utf8',
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        :param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched) or 'blankline' (blocks are read with
            ``read_blankline_block``).
        :param encoding: The encoding passed to ``CorpusReader.__init__``.
        :param tagset: The name of the tagset used by this corpus, to be used
            for normalizing or converting the POS tags returned by the
            tagged_...() methods.
        """
        # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
        #       from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        """
        Read the next block(s) from ``stream`` using the configured
        ``detect_blocks`` strategy.

        :raise AssertionError: if ``detect_blocks`` is not a known strategy.
        """
        if self._detect_blocks == 'sexpr':
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == 'blankline':
            return read_blankline_block(stream)
        elif self._detect_blocks == 'unindented_paren':
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r'^\(')
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub('(?m)^%s.*' % re.escape(self._comment_char), '', tok)
                    for tok in toks
                ]
            return toks
        else:
            # Raised explicitly (same exception type as the former
            # ``assert 0``) so the check survives ``python -O``.
            raise AssertionError('bad block type')

    def _normalize(self, t):
        """Rewrite the bracketed string ``t`` into the form the regexps and tree parser expect."""
        # If there's an empty set of brackets surrounding the actual
        # parse, then strip them off.
        if EMPTY_BRACKETS.match(t):
            t = t.strip()[1:-1]
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """
        Parse the block ``t`` into a ``Tree``.  If parsing fails, progress
        messages are written to stderr and a recovery is attempted; the
        last resort is a flat ``S`` tree over the tagged words.
        """
        try:
            return Tree.fromstring(self._normalize(t))

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        # Must go through the string parser: ``Tree(...)``
                        # expects a node label plus a child list, not a
                        # bracketed string.
                        v = Tree.fromstring(self._normalize(t + ')' * n))
                        sys.stderr.write(
                            " Recovered by adding %d close paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree('S', self._tag(t))

    def _tag(self, t, tagset=None):
        """
        Extract ``(word, tag)`` pairs from the block ``t``.  When ``tagset``
        is given and differs from the corpus tagset, each tag is converted
        with ``map_tag``.
        """
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        """Extract the words (second element of each two-token leaf) from the block ``t``."""
        return WORD.findall(self._normalize(t))
class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """
    A reader for parsed corpora whose documents are
    divided into categories based on their file identifiers.
    @author: Nathan Schneider <nschneid@cs.cmu.edu>
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  The categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are consumed by
        ``CategorizedCorpusReader.__init__``, which receives the keyword
        dict itself; everything that remains is forwarded to
        ``BracketParseCorpusReader.__init__``.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a ``(fileids, categories)`` pair into a fileids selection.

        :raise ValueError: if both arguments are given.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    # Each view method below resolves the selection and then delegates to
    # the BracketParseCorpusReader attribute of the same name.
    # NOTE(review): paras, tagged_paras, parsed_words and parsed_paras are
    # not defined by the base classes visible in this file -- confirm they
    # exist before relying on them.

    def raw(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.raw(self, selection)

    def words(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.words(self, selection)

    def sents(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.sents(self, selection)

    def paras(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.paras(self, selection)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_words(self, selection, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_sents(self, selection, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.tagged_paras(self, selection, tagset)

    def parsed_words(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_words(self, selection)

    def parsed_sents(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_sents(self, selection)

    def parsed_paras(self, fileids=None, categories=None):
        selection = self._resolve(fileids, categories)
        return BracketParseCorpusReader.parsed_paras(self, selection)
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    This corpus has a lexical breakdown structure embedded, as read by _parse.
    Unfortunately this puts punctuation and some other words out of the sentence
    order in the xml element tree. This is no good for _tag and _word.
    _tag and _word are overridden to use a non-default parameter 'ordered'
    of the overridden _normalize function. The _parse function can then remain
    untouched.
    """

    def __init__(self, root, encoding='ISO-8859-1', tagset=None):
        """
        :param root: The root directory for this corpus.
        :param encoding: The encoding used to read the corpus file.
        :param tagset: The name of the tagset used by this corpus.
        """
        BracketParseCorpusReader.__init__(
            self,
            root,
            # Raw string: ``\.`` is a regexp escape, not a string escape.
            r'alpino\.xml',
            detect_blocks='blankline',
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.
        The sentence elements <alpino_ds>, although embedded in a few overall
        xml elements, are separated by blank lines. That's how the reader can
        deliver them one at a time.
        Each sentence has a few category subnodes that are of no use to us.
        The remaining word nodes may or may not appear in the proper order.
        Each word node has attributes, among which:
        - begin : the position of the word in the sentence
        - pos   : Part of Speech: the Tag
        - word  : the actual word
        The return value is a string with all xml elements replaced by
        clauses: either a cat clause with nested clauses, or a word clause.
        The order of the bracket clauses closely follows the xml.
        If ordered == True, the word clauses include an order sequence number.
        If ordered == False, the word clauses only have pos and word parts.
        Blocks that do not start with ``<alpino_ds`` normalize to ``""``.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            # ``<node .*?`` mirrors the unordered pattern below; the former
            # ``<node. *?`` only matched when ``begin`` came directly after
            # the tag name.
            t = re.sub(
                r' <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r' <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r" </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        """
        Return the ``(word, tag)`` pairs of block ``t`` sorted by their
        ``begin`` position, converting tags with ``map_tag`` when a
        different ``tagset`` is requested.
        """
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        # Tuples start with the integer position, so this restores
        # sentence order.
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list of words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]

View File

@@ -0,0 +1,199 @@
# Natural Language Toolkit: Categorized Sentences Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader structured for corpora that contain one instance on each row.
This CorpusReader is specifically used for the Subjectivity Dataset and the
Sentence Polarity Dataset.
- Subjectivity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
Distributed with permission.
Related papers:
- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using
Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL,
2004.
- Sentence Polarity Dataset information -
Authors: Bo Pang and Lillian Lee.
Url: http://www.cs.cornell.edu/people/pabo/movie-review-data
Related papers:
- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for
sentiment categorization with respect to rating scales". Proceedings of the
ACL, 2005.
"""
from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A reader for corpora in which each row represents a single instance, mainly
    a sentence. Instances are divided into categories based on their file identifiers
    (see CategorizedCorpusReader).
    Since many corpora allow rows that contain more than one sentence, it is
    possible to specify a sentence tokenizer to retrieve all sentences instead
    of all rows.

    Examples using the Subjectivity Dataset:

    >>> from nltk.corpus import subjectivity
    >>> subjectivity.sents()[23]
    ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits',
    'happened', 'off', 'screen', '.']
    >>> subjectivity.categories()
    ['obj', 'subj']
    >>> subjectivity.words(categories='subj')
    ['smart', 'and', 'alert', ',', 'thirteen', ...]

    Examples using the Sentence Polarity Dataset:

    >>> from nltk.corpus import sentence_polarity
    >>> sentence_polarity.sents()
    [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish',
    'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find',
    'it', 'funny', '.'], ...]
    >>> sentence_polarity.categories()
    ['neg', 'pos']
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding='utf8',
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def _resolve(self, fileids, categories):
        """
        Turn a ``(fileids, categories)`` pair into a fileids selection.

        :raise ValueError: if both arguments are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    def _read_all(self, fileid):
        """Return the full contents of one corpus file, closing the stream afterwards."""
        stream = self.open(fileid)
        try:
            return stream.read()
        finally:
            stream.close()

    def raw(self, fileids=None, categories=None):
        """
        :param fileids: a list or regexp specifying the fileids that have to be
            returned as a raw string.
        :param categories: a list specifying the categories whose files have to
            be returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self._read_all(f) for f in fileids])

    def readme(self):
        """
        Return the contents of the corpus README file.
        """
        return self._read_all("README")

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences have
            to be returned.
        :return: the given file(s) as a list of sentences.
            Each sentence is tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, _fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        file(s).

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have to
            be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, _fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """
        Read up to 20 lines from ``stream`` and return the tokenized
        sentences found in them.  With a sentence tokenizer each line may
        yield several sentences; without one, each line is one sentence.
        """
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # An empty string means end of file: nothing more to
                # read, so stop instead of polling for the remaining
                # iterations.
                break
            if self._sent_tokenizer:
                sents.extend(
                    [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                )
            else:
                sents.append(self._word_tokenizer.tokenize(line))
        return sents

    def _read_word_block(self, stream):
        """Return the words of the next sentence block as one flat list."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

View File

@@ -0,0 +1,171 @@
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
from __future__ import print_function
import sys
from six import string_types
from nltk.corpus.reader import util
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class ChasenCorpusReader(CorpusReader):
    """
    Corpus reader whose view methods are all backed by ``ChasenCorpusView``,
    differing only in whether tokens are tagged and how they are grouped.
    """

    def __init__(self, root, fileids, encoding='utf8', sent_splitter=None):
        """
        :param root: passed through to ``CorpusReader.__init__``.
        :param fileids: passed through to ``CorpusReader.__init__``.
        :param encoding: passed through to ``CorpusReader.__init__``.
        :param sent_splitter: optional callable handed to every
            ``ChasenCorpusView``; it is called with a ``(word, tag)``
            tuple and a true result ends the current sentence.
        """
        self._sent_splitter = sent_splitter
        CorpusReader.__init__(self, root, fileids, encoding)

    def raw(self, fileids=None):
        """
        Return the concatenated raw contents of the given file(s).

        :param fileids: None for every file in the corpus, a single file
            identifier, or a list of file identifiers.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Release each handle as soon as its contents are read.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _chasen_views(self, fileids, tagged, group_by_sent, group_by_para):
        """Concatenate one ``ChasenCorpusView`` per file with the given flags."""
        return concat(
            [
                ChasenCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sent_splitter,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """Untagged tokens, not grouped."""
        return self._chasen_views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """``(word, tag)`` tokens, not grouped."""
        return self._chasen_views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Untagged tokens grouped by sentence."""
        return self._chasen_views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """``(word, tag)`` tokens grouped by sentence."""
        return self._chasen_views(fileids, True, True, False)

    def paras(self, fileids=None):
        """Untagged tokens grouped by sentence and by paragraph."""
        return self._chasen_views(fileids, False, True, True)

    def tagged_paras(self, fileids=None):
        """``(word, tag)`` tokens grouped by sentence and by paragraph."""
        return self._chasen_views(fileids, True, True, True)
class ChasenCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sent_splitter=None,
    ):
        """
        :param corpus_file: the file to read, passed to the base view.
        :param encoding: the encoding, passed to the base view.
        :param tagged: If true, tokens are ``(word, tag)`` tuples;
            otherwise just the word.
        :param group_by_sent: If true, tokens are grouped into sentences.
        :param group_by_para: If true, each paragraph is kept as its own list.
        :param sent_splitter: optional callable; when it returns a true
            value for a ``(word, tag)`` tuple, the sentence ends there.
        """
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._sent_splitter = sent_splitter
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""

        def emit(tokens, paragraph):
            # Close off one sentence: drop tags if untagged output was
            # requested, then add it to the paragraph as a unit or flat.
            if not self._tagged:
                tokens = [word for (word, _tag) in tokens]
            if self._group_by_sent:
                paragraph.append(tokens)
            else:
                paragraph.extend(tokens)

        block = []
        for para_str in read_regexp_block(stream, r".", r"^EOS\n"):
            para, sent = [], []

            for line in para_str.splitlines():
                is_eos = line.strip() == 'EOS'
                cells = line.split('\t')
                # First cell is the word; the remaining cells, re-joined
                # with tabs, form the tag.
                token = (cells[0], '\t'.join(cells[1:]))
                if not is_eos:
                    sent.append(token)

                if is_eos or (self._sent_splitter and self._sent_splitter(token)):
                    emit(sent, para)
                    sent = []

            # Tokens left over after the last line form a final sentence.
            if len(sent) > 0:
                emit(sent, para)

            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        return block
def demo():
    """Print a slice of words and three tagged sentences from the 'jeita' corpus."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')

    word_slice = jeita.words()[22100:22140]
    print('/'.join(word_slice))

    # One "word/field" line per token; the field is the third tab-separated
    # cell of the token's tag string.
    rendered = []
    for sent in jeita.tagged_sents()[2170:2173]:
        rendered.append(
            '\n'.join("%s/%s" % (w[0], w[1].split('\t')[2]) for w in sent)
        )
    print('\nEOS\n'.join(rendered))
def test():
    """Check that the tag of the first tagged word in 'jeita' is a string."""
    from nltk.corpus.util import LazyCorpusLoader

    jeita = LazyCorpusLoader('jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8')
    first_token = jeita.tagged_words()[0]
    assert isinstance(first_token[1], string_types)
# When executed as a script, run the demo followed by the smoke test.
if __name__ == '__main__':
    demo()
    test()

View File

@@ -0,0 +1,633 @@
# CHILDES XML Corpus Reader
# Copyright (C) 2001-2019 NLTK Project
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the XML version of the CHILDES corpus.
"""
from __future__ import print_function, division
__docformat__ = 'epytext en'
import re
from collections import defaultdict
from six import string_types
from nltk.util import flatten, LazyMap, LazyConcatenation
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
# to resolve the namespace issue
NS = 'http://www.talkbank.org/ns/talkbank'
class CHILDESCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the CHILDES corpus.
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
(``nltk_data/corpora/CHILDES/``).
For access to the file text use the usual nltk functions,
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
"""
def __init__(self, root, fileids, lazy=True):
    """
    :param root: Passed through to ``XMLCorpusReader.__init__``.
    :param fileids: Passed through to ``XMLCorpusReader.__init__``.
    :param lazy: Stored as ``self._lazy``.  The accessor methods of this
        reader return lazy sequences when it is true and plain lists
        (one entry per file) when it is false.
    """
    XMLCorpusReader.__init__(self, root, fileids)
    self._lazy = lazy
def words(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=False,
    strip_space=True,
    replace=False,
):
    """
    Return the words of the given file(s).

    In lazy mode the per-file results are chained into one lazy sequence;
    otherwise a list with one entry per file is returned.

    :rtype: list(str)
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of (stem, index,
        dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_one(path):
        # sent=None, pos=False: ungrouped, untagged tokens.
        return self._get_words(
            path, speaker, None, stem, relation, False, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(read_one, paths))
    return [read_one(path) for path in paths]
def tagged_words(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=False,
    strip_space=True,
    replace=False,
):
    """
    Return the tagged words and punctuation symbols of the given file(s),
    encoded as ``(word,tag)`` tuples.

    In lazy mode the per-file results are chained into one lazy sequence;
    otherwise a list with one entry per file is returned.

    :rtype: list(tuple(str,str))
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of (stem, index,
        dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_one(path):
        # sent=None, pos=True: ungrouped tokens with part-of-speech tags.
        return self._get_words(
            path, speaker, None, stem, relation, True, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(read_one, paths))
    return [read_one(path) for path in paths]
def sents(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=None,
    strip_space=True,
    replace=False,
):
    """
    Return the sentences or utterances of the given file(s), each encoded
    as a list of word strings.

    In lazy mode the per-file results are chained into one lazy sequence;
    otherwise a list with one entry per file is returned.

    :rtype: list(list(str))
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
        If there is manually-annotated relation info, it will return
        tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_one(path):
        # sent=True, pos=False: tokens grouped by utterance, untagged.
        return self._get_words(
            path, speaker, True, stem, relation, False, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(read_one, paths))
    return [read_one(path) for path in paths]
def tagged_sents(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=None,
    strip_space=True,
    replace=False,
):
    """
    Return the sentences of the given file(s), each encoded as a list of
    ``(word,tag)`` tuples.

    In lazy mode the per-file results are chained into one lazy sequence;
    otherwise a list with one entry per file is returned.

    :rtype: list(list(tuple(str,str)))
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
        If there is manually-annotated relation info, it will return
        tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_one(path):
        # sent=True, pos=True: tagged tokens grouped by utterance.
        return self._get_words(
            path, speaker, True, stem, relation, True, strip_space, replace
        )

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyConcatenation(LazyMap(read_one, paths))
    return [read_one(path) for path in paths]
def corpus(self, fileids=None):
    """
    Return, for each given file, a dict of ``(corpus_property_key, value)``
    built from the attributes of the file's root XML element.

    :rtype: list(dict)
    """
    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(self._get_corpus, paths)
    return [self._get_corpus(path) for path in paths]
def _get_corpus(self, fileid):
results = dict()
xmldoc = ElementTree.parse(fileid).getroot()
for key, value in xmldoc.items():
results[key] = value
return results
def participants(self, fileids=None):
    """
    Return, for each given file, a dict keyed by participant id whose
    values are dicts of ``(participant_property_key, value)``.

    :rtype: list(dict)
    """
    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(self._get_participants, paths)
    return [self._get_participants(path) for path in paths]
def _get_participants(self, fileid):
    """
    Parse ``fileid`` and collect the attributes of every
    ``Participants/participant`` element, keyed first by the element's
    ``id`` attribute and then by attribute name.
    """

    # Auto-vivifying nested mapping: missing keys create another level.
    def nested():
        return defaultdict(nested)

    root = ElementTree.parse(fileid).getroot()
    participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)
    table = nested()
    for node in root.findall(participant_path):
        for (attr_name, attr_value) in node.items():
            table[node.get('id')][attr_name] = attr_value
    return table
def age(self, fileids=None, speaker='CHI', month=False):
    """
    Return the age recorded for ``speaker`` in each of the given file(s).

    :rtype: list or int
    :param month: If true, return months instead of year-month-date
    """

    def read_one(path):
        return self._get_age(path, speaker, month)

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(read_one, paths)
    return [read_one(path) for path in paths]
def _get_age(self, fileid, speaker, month):
    """
    Return the ``age`` attribute of the participant whose ``id`` equals
    ``speaker``, converted to months when ``month`` is true.  Returns
    None when no participant matches or the age cannot be converted.
    """
    root = ElementTree.parse(fileid).getroot()
    participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)
    for node in root.findall(participant_path):
        try:
            if node.get('id') != speaker:
                continue
            age = node.get('age')
            return self.convert_age(age) if month else age
        except (TypeError, AttributeError):
            # some files don't have age data
            return None
def convert_age(self, age_year):
    """
    Calculate age in months from a string in CHILDES format.

    The string is matched against ``P<years>Y<months>M<days>D``, where
    the ``M``, day digits and ``D`` are optional.  The result is
    ``years * 12 + months``, plus one when the day component exceeds 15.

    :param age_year: the age string, e.g. ``'P2Y6M14D'``.
    :return: the age in whole months.
    :rtype: int
    :raise AttributeError: if ``age_year`` does not match the pattern.
    :raise TypeError: if ``age_year`` is not a string (e.g. None).
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
    age_month = int(m.group(1)) * 12 + int(m.group(2))
    try:
        if int(m.group(3)) > 15:
            age_month += 1
    except ValueError:
        # The day component is empty; leave the month count unrounded.
        pass
    return age_month
def MLU(self, fileids=None, speaker='CHI'):
    """
    Return one value per given file, as computed by ``_getMLU`` for
    ``speaker``.

    :rtype: list(float)
    """

    def read_one(path):
        return self._getMLU(path, speaker=speaker)

    paths = self.abspaths(fileids)
    if self._lazy:
        return LazyMap(read_one, paths)
    return [read_one(path) for path in paths]
def _getMLU(self, fileid, speaker):
sents = self._get_words(
fileid,
speaker=speaker,
sent=True,
stem=True,
relation=False,
pos=True,
strip_space=True,
replace=True,
)
results = []
lastSent = []
numFillers = 0
sentDiscount = 0
for sent in sents:
posList = [pos for (word, pos) in sent]
# if any part of the sentence is intelligible
if any(pos == 'unk' for pos in posList):
continue
# if the sentence is null
elif sent == []:
continue
# if the sentence is the same as the last sent
elif sent == lastSent:
continue
else:
results.append([word for (word, pos) in sent])
# count number of fillers
if len(set(['co', None]).intersection(posList)) > 0:
numFillers += posList.count('co')
numFillers += posList.count(None)
sentDiscount += 1
lastSent = sent
try:
thisWordList = flatten(results)
# count number of morphemes
# (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
numWords = (
len(flatten([word.split('-') for word in thisWordList])) - numFillers
)
numSents = len(results) - sentDiscount
mlu = numWords / numSents
except ZeroDivisionError:
mlu = 0
# return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
return mlu
def _get_words(
    self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
    """
    Parse one CHILDES XML file and return its word tokens.

    :param fileid: path (or file object) accepted by ``ElementTree.parse``.
    :param speaker: ``'ALL'``, a single speaker id, or a collection of
        speaker ids; only utterances whose ``who`` attribute is selected
        are read.
    :param sent: if true, group tokens by utterance.
    :param stem: if true, use the ``stem`` text, with ``-inflection`` and
        ``~suffix`` appended when present.
    :param relation: if true, stems and tags are used as well; when it
        equals True each token is extended with ``index|head|relation``
        strings taken from the ``gra`` elements.  Tokens are grouped by
        utterance.
    :param pos: if true, tokens are ``(word, tag)`` tuples.
    :param strip_space: if true, strip whitespace from the word text.
    :param replace: if true, substitute replacement words (see the note
        in the body).
    :return: a ``LazyMap`` over the collected tokens (or utterances).
    """
    if (
        isinstance(speaker, string_types) and speaker != 'ALL'
    ):  # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()
    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sents = []
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            for xmlword in xmlsent.findall('.//{%s}w' % NS):
                suffixStem = None
                suffixTag = None
                # getting replaced words
                # NOTE(review): both lookups search the whole utterance
                # (xmlsent), not the current word, so every word of an
                # utterance is swapped for the utterance's first match.
                # Kept as is because MLU() depends on the current output.
                # The explicit "is not None and len(...)" reproduces the
                # former Element truth-value test (true only for elements
                # with children), which is deprecated.
                if replace:
                    xmlrepl = xmlsent.find('.//{%s}w/{%s}replacement' % (NS, NS))
                    if xmlrepl is not None and len(xmlrepl):
                        xmlword = xmlsent.find(
                            './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
                        )
                    else:
                        xmlwk = xmlsent.find('.//{%s}w/{%s}wk' % (NS, NS))
                        if xmlwk is not None and len(xmlwk):
                            xmlword = xmlwk
                # get text
                if xmlword.text:
                    word = xmlword.text
                else:
                    word = ''
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    try:
                        xmlstem = xmlword.find('.//{%s}stem' % NS)
                        word = xmlstem.text
                    except AttributeError:
                        # no <stem> element: keep the surface form
                        pass
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find(
                            './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
                        )
                        word += '-' + xmlinfl.text
                    except (AttributeError, TypeError):
                        # no <mk> element, or missing text on either side
                        pass
                    # if there is a suffix
                    try:
                        xmlsuffix = xmlword.find(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem'
                            % (NS, NS, NS, NS)
                        )
                        suffixStem = xmlsuffix.text
                    except AttributeError:
                        suffixStem = ""
                    if suffixStem:
                        word += "~" + suffixStem
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(".//{%s}c" % NS)
                        xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                        if xmlpos2 != []:
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                    except (AttributeError, IndexError):
                        tag = ""
                    try:
                        xmlsuffixpos = xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c'
                            % (NS, NS, NS, NS, NS)
                        )
                        xmlsuffixpos2 = xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s'
                            % (NS, NS, NS, NS, NS)
                        )
                        if xmlsuffixpos2:
                            suffixTag = (
                                xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                            )
                        else:
                            suffixTag = xmlsuffixpos[0].text
                    except (AttributeError, IndexError, TypeError):
                        # no suffix part-of-speech information
                        pass
                    if suffixTag:
                        tag += "~" + suffixTag
                    word = (word, tag)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation == True:
                    for xmlstem_rel in xmlword.findall(
                        './/{%s}mor/{%s}gra' % (NS, NS)
                    ):
                        if not xmlstem_rel.get('type') == 'grt':
                            word = (
                                word[0],
                                word[1],
                                xmlstem_rel.get('index')
                                + "|"
                                + xmlstem_rel.get('head')
                                + "|"
                                + xmlstem_rel.get('relation'),
                            )
                        else:
                            word = (
                                word[0],
                                word[1],
                                word[2],
                                word[0],
                                word[1],
                                xmlstem_rel.get('index')
                                + "|"
                                + xmlstem_rel.get('head')
                                + "|"
                                + xmlstem_rel.get('relation'),
                            )
                    try:
                        for xmlpost_rel in xmlword.findall(
                            './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)
                        ):
                            if not xmlpost_rel.get('type') == 'grt':
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get('index')
                                    + "|"
                                    + xmlpost_rel.get('head')
                                    + "|"
                                    + xmlpost_rel.get('relation'),
                                )
                            else:
                                suffixStem = (
                                    suffixStem[0],
                                    suffixStem[1],
                                    suffixStem[2],
                                    suffixStem[0],
                                    suffixStem[1],
                                    xmlpost_rel.get('index')
                                    + "|"
                                    + xmlpost_rel.get('head')
                                    + "|"
                                    + xmlpost_rel.get('relation'),
                                )
                    except (AttributeError, IndexError, TypeError):
                        # suffix stem too short to index, or a missing
                        # attribute on the <gra> element
                        pass
                sents.append(word)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    return LazyMap(lambda x: x, results)
# Ready-to-use browser opener
# The string below is a free-standing expression that describes
# ``childes_url_base``; ``webview_file`` prepends that URL to the file path.
"""
The base URL for viewing files on the childes website. This
shouldn't need to be changed, unless CHILDES changes the configuration
of their server or unless the user sets up their own corpus webserver.
"""
childes_url_base = r'https://childes.talkbank.org/browser/index.php?url='
def webview_file(self, fileid, urlbase=None):
    """Map a corpus file to its web version on the CHILDES website,
    and open it in a web browser.

    The complete URL to be used is:
        childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

    If no urlbase is passed, we try to calculate it.  This
    requires that the childes corpus was set up to mirror the
    folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
    nltk_data/corpora/childes/Eng-USA/Cornell/??? or
    nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

    The function first looks for "/childes/", possibly followed by
    "data-xml", on the path consisting of <corpus root>+fileid; then,
    as a special case, for "Eng-USA".  Both checks ignore case.  If
    neither one is found, we use the unmodified fileid and hope for
    the best.  If this is not right, specify urlbase explicitly, e.g.,
    if the corpus root points to the Cornell folder,
    urlbase='Eng-USA/Cornell'.

    :param fileid: the corpus file to view.
    :param urlbase: optional path prefix placed before ``fileid``.
    """
    import webbrowser

    if urlbase:
        path = urlbase + "/" + fileid
    else:
        full = self.root + "/" + fileid
        # Normalise Windows separators so the patterns below can match.
        full = re.sub(r'\\', '/', full)
        if '/childes/' in full.lower():
            # Discard /data-xml/ if present
            path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0]
        elif 'eng-usa' in full.lower():
            # The inline (?i) flag must open the pattern; placing it
            # mid-pattern is an error on Python 3.11 and later.
            path = 'Eng-USA/' + re.findall(r'(?i)/Eng-USA/(.*)\.xml', full)[0]
        else:
            path = fileid
    # Strip ".xml" and add ".cha", as necessary:
    if path.endswith('.xml'):
        path = path[:-4]
    if not path.endswith('.cha'):
        path = path + '.cha'
    url = self.childes_url_base + path
    webbrowser.open_new_tab(url)
    print("Opening in browser:", url)
    # Pausing is a good idea, but it's up to the user...
    # raw_input("Hit Return to continue")
def demo(corpus_root=None):
    """
    Print a tour of the reader's accessors for the first five files found
    under ``corpus_root``.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``

    :param corpus_root: path to a portion of the CHILDES XML corpus.  When
        omitted, ``corpora/childes/data-xml/Eng-USA/`` is looked up in the
        NLTK data directories.
    """
    try:
        if not corpus_root:
            from nltk.data import find

            # The lookup sits inside the ``try`` so that a missing corpus
            # reaches the LookupError handler and its help text below.
            corpus_root = find('corpora/childes/data-xml/Eng-USA/')

        childes = CHILDESCorpusReader(corpus_root, r'.*\.xml')
        # describe all corpus
        for fileid in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(fileid)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(fileid)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(fileid, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(fileid)[:7], " ...")
            print("words (only MOT):", childes.words(fileid, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(fileid, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(fileid, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(fileid, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(fileid)[:2], " ...")
            for (participant, values) in childes.participants(fileid)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(fileid)))
            print("num of morphemes:", len(childes.words(fileid, stem=True)))
            print("age:", childes.age(fileid))
            print("age in month:", childes.age(fileid, month=True))
            print("MLU:", childes.MLU(fileid))
            print()
    except LookupError:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
downloaded from https://childes.talkbank.org/data-xml/ and saved at
[NLTK_Data_Dir]/corpora/childes/
Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
demo('/path/to/childes/data-xml/Eng-USA/')
"""
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
# When executed as a script, run the demo with the default corpus location.
if __name__ == "__main__":
    demo()

View File

@@ -0,0 +1,285 @@
# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that contain chunked (and optionally tagged)
documents.
"""
import os.path, codecs
from six import string_types
import nltk
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.tree import Tree
from nltk.tokenize import *
from nltk.chunk import tagstr2tree
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension='',
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        para_block_reader=read_blankline_block,
        encoding='utf8',
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param extension: Accepted but not used by this constructor.
        :param str2chunktree: Function that converts a sentence string
            into a chunk tree.
        :param sent_tokenizer: Tokenizer that splits a paragraph string
            into sentence strings.
        :param para_block_reader: Block reader that yields paragraph
            strings from a stream.
        :param encoding: The encoding used to read the corpus files.
        :param tagset: Passed to every view as its source tagset.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Trailing positional arguments shared by every corpus view built
        # by this reader: (str2chunktree, sent_tokenizer,
        # para_block_reader, source_tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream as soon as it has been read instead of
            # leaving the handle open until it is garbage collected.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _chunked_views(
        self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None
    ):
        """Concatenate one ``ChunkedCorpusView`` per file, built with the given flags."""
        return concat(
            [
                ChunkedCorpusView(
                    path,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    chunked,
                    *self._cv_args,
                    target_tagset=tagset
                )
                for (path, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._chunked_views(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._chunked_views(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._chunked_views(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._chunked_views(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._chunked_views(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._chunked_views(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._chunked_views(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        """
        return self._chunked_views(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._chunked_views(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        """Read one blank-line-delimited block and parse each item with ``tagstr2tree``.

        This uses the module defaults, not the functions passed to the
        constructor.
        """
        return [tagstr2tree(t) for t in read_blankline_block(stream)]
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    Corpus view over one chunked file.  Paragraph splitting, sentence
    tokenizing and chunk-tree parsing are delegated to the callables
    given to the constructor; the four flags select whether tags and
    chunks are kept and how the output is grouped.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-shaping flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Parsing pipeline.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        # Tagsets forwarded to ``str2chunktree``.
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read the paragraphs produced by the block reader and return their content."""
        paragraphs = []
        for para_text in self._para_block_reader(stream):
            sentences = []
            for sent_text in self._sent_tokenizer.tokenize(para_text):
                tree = self._str2chunktree(
                    sent_text,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )
                # Tags are removed before chunks so that the leaves
                # collected below are already untagged.
                if not self._tagged:
                    tree = self._untag(tree)
                if not self._chunked:
                    tree = tree.leaves()
                # Nest the sentence, or splice its items into the paragraph.
                add_sentence = (
                    sentences.append if self._group_by_sent else sentences.extend
                )
                add_sentence(tree)
            # Nest the paragraph, or splice its items into the block.
            add_paragraph = (
                paragraphs.append if self._group_by_para else paragraphs.extend
            )
            add_paragraph(sentences)
        return paragraphs

    def _untag(self, tree):
        """Replace every ``(word, tag)`` leaf of ``tree`` by its word, in place, and return ``tree``."""
        for index, node in enumerate(tree):
            if isinstance(node, Tree):
                self._untag(node)
                continue
            if not isinstance(node, tuple):
                raise ValueError('expected child to be Tree or tuple')
            tree[index] = node[0]
        return tree

View File

@@ -0,0 +1,99 @@
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
ftp://ftp.cs.cmu.edu/project/speech/dict/
Copyright 1998 Carnegie Mellon University
File Format: Each line consists of an uppercased word, a counter
(for alternative pronunciations), and a transcription. Vowels are
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
NATURAL 1 N AE1 CH ER0 AH0 L
The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.
Phonemes: There are 39 phonemes, as shown below:
Phoneme Example Translation Phoneme Example Translation
------- ------- ----------- ------- ------- -----------
AA odd AA D AE at AE T
AH hut HH AH T AO ought AO T
AW cow K AW AY hide HH AY D
B be B IY CH cheese CH IY Z
D dee D IY DH thee DH IY
EH Ed EH D ER hurt HH ER T
EY ate EY T F fee F IY
G green G R IY N HH he HH IY
IH it IH T IY eat IY T
JH gee JH IY K key K IY
L lee L IY M me M IY
N knee N IY NG ping P IH NG
OW oat OW T OY toy T OY
P pee P IY R read R IY D
S sea S IY SH she SH IY
T tea T IY TH theta TH EY T AH
UH hood HH UH D UW two T UW
V vee V IY W we W IY
Y yield Y IY L D Z zee Z IY
ZH seizure S IY ZH ER
"""
from nltk import compat
from nltk.util import Index
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class CMUDictCorpusReader(CorpusReader):
    """Corpus reader for the Carnegie Mellon Pronouncing Dictionary."""

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
                for fileid, enc in self.abspaths(None, True)
            ]
        )

    def raw(self):
        """
        :return: the cmudict lexicon as a raw string.
        """
        fileids = self._fileids
        if isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream as soon as it has been read instead of
            # leaving the handle open until it is garbage collected.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [word.lower() for (word, _) in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        # Inside this method ``dict`` is the builtin: names bound in the
        # class body are not visible from method bodies.
        return dict(Index(self.entries()))
def read_cmudict_block(stream):
    """
    Read up to 100 dictionary entries from ``stream``.

    Each non-blank line is split on whitespace: the first field is the
    word (lower-cased), the second field (the pronunciation counter) is
    skipped, and the remaining fields form the transcription.  Blank or
    whitespace-only lines are skipped rather than raising ``IndexError``.

    :param stream: an object with a ``readline()`` method returning text.
    :return: a list of ``(word, transcription)`` tuples; empty once the
        stream is exhausted.
    """
    entries = []
    while len(entries) < 100:  # Read 100 at a time.
        line = stream.readline()
        if line == '':
            return entries  # end of file.
        pieces = line.split()
        if not pieces:
            continue  # blank line: nothing to record
        entries.append((pieces[0].lower(), pieces[2:]))
    return entries

View File

@@ -0,0 +1,328 @@
# Natural Language Toolkit: Comparative Sentence Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Comparative Sentence Dataset.
- Comparative Sentence Dataset information -
Annotated by: Nitin Jindal and Bing Liu, 2006.
Department of Computer Science
University of Illinois at Chicago
Contact: Nitin Jindal, njindal@cs.uic.edu
Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub)
Distributed with permission.
Related papers:
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
Proceedings of the ACM SIGIR International Conference on Information Retrieval
(SIGIR-06), 2006.
- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
Proceedings of Twenty First National Conference on Artificial Intelligence
(AAAI-2006), 2006.
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
"""
import re
from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Regular expressions for dataset components
STARS = re.compile(r'^\*+$')
COMPARISON = re.compile(r'<cs-[1234]>')
CLOSE_COMPARISON = re.compile(r'</cs-[1234]>')
GRAD_COMPARISON = re.compile(r'<cs-[123]>')
NON_GRAD_COMPARISON = re.compile(r'<cs-4>')
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
KEYWORD = re.compile(r'\((?!.*\()(.*)\)$')
class Comparison(object):
    """
    A Comparison holds one comparative sentence together with the
    constituents of the comparison it expresses.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text, self.comp_type = text, comp_type
        self.entity_1, self.entity_2 = entity_1, entity_2
        self.feature, self.keyword = feature, keyword

    def __repr__(self):
        template = (
            'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
            'feature="{}", keyword="{}")'
        )
        fields = (
            self.text,
            self.comp_type,
            self.entity_1,
            self.entity_2,
            self.feature,
            self.keyword,
        )
        return template.format(*fields)
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

        >>> from nltk.corpus import comparative_sentences
        >>> comparison = comparative_sentences.comparisons()[0]
        >>> comparison.text
        ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
        'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
        'had', '.']
        >>> comparison.entity_2
        'models'
        >>> (comparison.feature, comparison.keyword)
        ('rewind', 'more')
        >>> len(comparative_sentences.comparisons())
        853
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding='utf8',
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
        # Comparisons without a keyword carry a falsy value; drop those.
        return set(keyword.lower() for keyword in all_keywords if keyword)

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).

        Blank lines and lines starting with ``//`` are skipped.
        """
        keywords = []
        for line in self._read_file("listOfkeywords.txt").split("\n"):
            # Strip first so that whitespace-only lines (e.g. a stray "\r")
            # are treated as blank instead of being returned as ''.
            line = line.strip()
            if not line or line.startswith("//"):
                continue
            keywords.append(line)
        return keywords

    def raw(self, fileids=None):
        """
        :param fileids: a list or regexp specifying the fileids that have to be
            returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self._read_file(f) for f in fileids])

    def readme(self):
        """
        Return the contents of the corpus readme file.
        """
        return self._read_file("README.txt")

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_file(self, fileid):
        """Return the whole content of ``fileid`` and close the stream."""
        stream = self.open(fileid)
        try:
            return stream.read()
        finally:
            stream.close()

    def _tokenize_words(self, text):
        """Tokenize ``text`` into words, or return it unchanged when no word
        tokenizer was configured."""
        if self._word_tokenizer:
            return self._word_tokenizer.tokenize(text)
        return text

    def _read_comparison_block(self, stream):
        """
        Read forward to the next line containing comparison tags and return
        the Comparison objects it describes.

        :return: a list of Comparison objects (empty at end of file).
        """
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if not comparison_tags:
                continue
            grad_comparisons = re.findall(GRAD_COMPARISON, line)
            non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
            # Advance to the next line (it contains the comparative sentence)
            comparison_text = self._tokenize_words(stream.readline().strip())
            # Skip the next line (it contains closing comparison tags)
            stream.readline()
            comparison_bundle = []
            # Gradable comparisons: each comparison tag has its own relations
            # on a separate line, which is consumed here.
            for comp in grad_comparisons:
                comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
                comparison = Comparison(text=comparison_text, comp_type=comp_type)
                line = stream.readline()
                for (code, entity_feat) in ENTITIES_FEATS.findall(line):
                    if code == '1':
                        comparison.entity_1 = entity_feat.strip()
                    elif code == '2':
                        comparison.entity_2 = entity_feat.strip()
                    elif code == '3':
                        comparison.feature = entity_feat.strip()
                keyword = KEYWORD.findall(line)
                if keyword:
                    comparison.keyword = keyword[0]
                comparison_bundle.append(comparison)
            # Non-gradable comparisons have no relation line: create a simple
            # Comparison instance for each one.
            for comp in non_grad_comparisons:
                # comp_type in this case should always be 4.
                comp_type = int(re.match(r'<cs-(\d)>', comp).group(1))
                comparison_bundle.append(
                    Comparison(text=comparison_text, comp_type=comp_type)
                )
            return comparison_bundle

    def _read_keyword_block(self, stream):
        """Return the keyword of every comparison in the next comparison block."""
        return [
            comparison.keyword for comparison in self._read_comparison_block(stream)
        ]

    def _read_sent_block(self, stream):
        """
        Return the sentence(s) found on the next line that is neither part of
        a ``STARS``-delimited header nor a comparison/relation tag line.
        """
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                # Skip everything up to the closing line of stars.  Also stop
                # at end of file: readline() then keeps returning '' and the
                # loop would otherwise never terminate.
                while True:
                    line = stream.readline()
                    if not line or re.match(STARS, line):
                        break
                continue
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._tokenize_words(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._tokenize_words(line)]

    def _read_word_block(self, stream):
        """Return the words of the next sentence block as one flat list."""
        words = []
        for sent in self._read_sent_block(stream):
            if isinstance(sent, string_types):
                # No word tokenizer: the sentence is a plain string, so keep
                # it whole rather than splitting it into characters.
                words.append(sent)
            else:
                words.extend(sent)
        return words

View File

@@ -0,0 +1,592 @@
# Natural Language Toolkit: CONLL Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Read CoNLL-style chunk files.
"""
from __future__ import unicode_literals
import textwrap
from nltk import compat
from nltk.tree import Tree
from nltk.util import LazyMap, LazyConcatenation
from nltk.tag import map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files. These files consist of a
    series of sentences, separated by blank lines. Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type. The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``'\\t'``).

    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view? This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-. Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = 'words'  #: column type for words
    POS = 'pos'  #: column type for part-of-speech tags
    TREE = 'tree'  #: column type for parse trees
    CHUNK = 'chunk'  #: column type for chunk structures
    NE = 'ne'  #: column type for named entities
    SRL = 'srl'  #: column type for semantic role labels
    IGNORE = 'ignore'  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label='S',
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding='utf8',
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        """
        :param columntypes: sequence of column types (see ``COLUMN_TYPES``),
            one per column of the files, in file order.
        :param chunk_types: chunk type, or list of chunk types, to keep when
            building chunk trees; ``None`` keeps all of them.
        :param root_label: label of the root node of chunk trees (also used
            to wrap parse trees that cannot be read as a single tree).
        :param pos_in_tree: default for the ``pos_in_tree`` argument of the
            parse-tree based access methods.
        :param srl_includes_roleset: whether the SRL column is followed by a
            separate predicate column (i.e. the SRL column holds rolesets).
        :param tree_class: class whose ``fromstring`` builds parse trees.
        :param tagset: name of the tagset used by the corpus.
        :param separator: string passed to ``str.split`` to split a line into
            columns; ``None`` splits on runs of whitespace.
        :raise ValueError: if ``columntypes`` contains an unknown column type.
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError('Bad column type %r' % columntype)
        if isinstance(chunk_types, string_types):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        # Maps a column type to the index of its column.
        self._colmap = dict((c, i) for (i, c) in enumerate(columntypes))
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def raw(self, fileids=None):
        """Return the content of the given file(s) as a single string."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream explicitly instead of relying on garbage
            # collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None):
        """Return the words column of all sentences as one flat lazy sequence."""
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        """Return a lazy sequence with one list of words per sentence."""
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return a flat lazy sequence of ``(word, pos_tag)`` tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        """Return a lazy sequence with one ``(word, pos_tag)`` list per sentence."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        """Return the children of every sentence's chunk tree as one flat
        lazy sequence."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        """Return a lazy sequence with one chunk tree per sentence."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        """Return a lazy sequence with one parse tree per sentence."""
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """Return, per sentence, one list of ``((start, end), tag)`` spans for
        each predicate."""
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """
        Return the SRL instances of the corpus.

        :param flatten: if true, return one flat sequence of instances;
            otherwise return one instance list per sentence.
        """
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        """Return a corpus view yielding one grid (list of rows) per sentence."""
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
        """
        Read the next blank-line delimited block and split it into a grid.

        :raise ValueError: if the rows of a grid differ in length.
        """
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue

            grid = [line.split(self.sep) for line in block.split('\n')]

            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
                del grid[0]

            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError('Inconsistent number of columns:\n%s' % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_pos_tags(self, grid, tagset=None):
        """Return the POS column, mapped to ``tagset`` when one is requested
        that differs from the corpus tagset."""
        pos_tags = self._get_column(grid, self._colmap['pos'])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return pos_tags

    def _get_words(self, grid):
        return self._get_column(grid, self._colmap['words'])

    def _get_tagged_words(self, grid, tagset=None):
        pos_tags = self._get_pos_tags(grid, tagset)
        return list(zip(self._get_column(grid, self._colmap['words']), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        pos_tags = self._get_pos_tags(grid, tagset)
        return list(
            zip(
                self._get_column(grid, self._colmap['words']),
                pos_tags,
                self._get_column(grid, self._colmap['chunk']),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        """Build a two-level chunk tree from the IOB chunk column of ``grid``."""
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_pos_tags(grid, tagset)
        chunk_tags = self._get_column(grid, self._colmap['chunk'])

        stack = [Tree(self._root_label, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == 'O':
                state, chunk_type = 'O', ''
            else:
                # Split on the first hyphen only, so that chunk types which
                # themselves contain a hyphen do not break the unpacking.
                (state, chunk_type) = chunk_tag.split('-', 1)
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = 'O'
            # Treat a mismatching I like a B.
            if state == 'I' and chunk_type != stack[-1].label():
                state = 'B'
            # For B or O: close any open chunks
            if state in 'BO' and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == 'B':
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        """
        Build the parse tree encoded in the tree column of ``grid``.

        :param pos_in_tree: if false, every preterminal ``(pos word)`` is
            replaced by a ``(word, pos)`` tuple leaf.
        """
        words = self._get_column(grid, self._colmap['words'])
        pos_tags = self._get_pos_tags(grid, tagset)
        parse_tags = self._get_column(grid, self._colmap['tree'])

        pieces = []
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            # Literal brackets would be read as tree structure; escape them.
            if word == '(':
                word = '-LRB-'
            if word == ')':
                word = '-RRB-'
            if pos_tag == '(':
                pos_tag = '-LRB-'
            if pos_tag == ')':
                pos_tag = '-RRB-'
            (left, right) = parse_tag.split('*')
            right = right.count(')') * ')'  # only keep ')'.
            pieces.append('%s (%s %s) %s' % (left, pos_tag, word, right))
        treestr = ''.join(pieces)
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            # Not a single well-formed tree: wrap it under the root label.
            tree = self._tree_class.fromstring('(%s %s)' % (self._root_label, treestr))

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], string_types)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        list of list of ((start, end), tag) tuples
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap['srl'] + 1)
            start_col = self._colmap['srl'] + 2
        else:
            predicates = self._get_column(grid, self._colmap['srl'])
            start_col = self._colmap['srl'] + 1

        # Count how many predicates there are. This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != '-'])

        spanlists = []
        for pred_index in range(num_preds):
            col = self._get_column(grid, start_col + pred_index)
            spanlist = []
            stack = []
            for wordnum, srl_tag in enumerate(col):
                (left, right) = srl_tag.split('*')
                for tag in left.split('('):
                    if tag:
                        stack.append((tag, wordnum))
                # Every ')' closes the most recently opened span.
                for _ in range(right.count(')')):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        """
        Build one ``ConllSRLInstance`` per predicate of the sentence.

        :raise ValueError: if no span list has a verb span covering a
            predicate's word index.
        """
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap['srl'] + 1)
            rolesets = self._get_column(grid, self._colmap['srl'])
        else:
            predicates = self._get_column(grid, self._colmap['srl'])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == '-':
                continue
            # Decide which spanlist to use. Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                if any(
                    start <= wordnum < end and tag in ('V', 'C-V')
                    for (start, end), tag in spanlist
                ):
                    break
            else:
                raise ValueError('No srl column found for %r' % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        """Raise ValueError unless every given column type is in this corpus."""
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    'This corpus does not contain a %s ' 'column.' % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        """Return the values found at ``column_index`` in every row of ``grid``."""
        return [row[column_index] for row in grid]
@compat.python_2_unicode_compatible
class ConllSRLInstance(object):
    """
    An SRL instance from a CoNLL corpus, which identifies and
    provides labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        #: A list of the word indices of the words that compose the
        #: verb whose arguments are identified by this instance.
        #: This will contain multiple word indices when multi-word
        #: verbs are used (e.g. 'turn on').
        self.verb = []

        #: The word index of the head word of the verb whose arguments
        #: are identified by this instance. E.g., for a sentence that
        #: uses the verb 'turn on,' ``verb_head`` will be the word index
        #: of the word 'turn'.
        self.verb_head = verb_head

        self.verb_stem = verb_stem

        self.roleset = roleset

        #: A list of ``(argspan, argid)`` tuples, specifying the location
        #: and type for each of the arguments identified by this
        #: instance. ``argspan`` is a tuple ``start, end``, indicating
        #: that the argument consists of the ``words[start:end]``.
        self.arguments = []

        #: A list of ``(span, id)`` tuples, specifying the location and
        #: type for each of the arguments, as well as the verb pieces,
        #: that make up this instance.
        self.tagged_spans = tagged_spans

        #: The parse tree for the sentence containing this instance.
        self.tree = tree

        #: A list of the words in the sentence containing this
        #: instance.
        self.words = tree.leaves()

        # Fill in the self.verb and self.arguments values.
        for (start, end), tag in tagged_spans:
            if tag in ('V', 'C-V'):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        plural = 's' if len(self.arguments) != 1 else ''
        return '<ConllSRLInstance for %r with %d argument%s>' % (
            (self.verb_stem, len(self.arguments), plural)
        )

    def pprint(self):
        """
        Return the sentence as a string in which every argument is wrapped in
        ``[argid ...]`` and the verb words are marked with ``<<...>>``.
        """
        # Leaves are either plain strings or (word, tag) tuples; reduce both
        # to the word itself.  (Indexing a plain string with [0] would only
        # yield its first character.)
        words = [w[0] if isinstance(w, tuple) else w for w in self.words]
        verbstr = ' '.join(words[i] for i in self.verb)
        hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem)
        s = ''
        for i, word in enumerate(words):
            for (start, end), argid in self.arguments:
                if i == start:
                    s += '[%s ' % argid
                if i == end:
                    s += '] '
            if i in self.verb:
                word = '<<%s>>' % word
            s += word + ' '
        # Spans are end-exclusive, so an argument reaching the last word has
        # end == len(words) and is never closed inside the loop above.
        for (start, end), argid in self.arguments:
            if end == len(words):
                s += '] '
        return hdr + textwrap.fill(
            s.replace(' ]', ']'), initial_indent=' ', subsequent_indent=' '
        )
@compat.python_2_unicode_compatible
class ConllSRLInstanceList(list):
    """
    Set of instances for a single sentence
    """

    def __init__(self, tree, instances=()):
        """
        :param tree: the parse tree shared by all instances of the sentence.
        :param instances: the initial SRL instances.
        """
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """
        Return the instances as a column-formatted table with one line per
        word: the verb stem (or ``-``) followed by one bracketed span column
        per instance.

        :param include_tree: if true, prepend word, POS and parse-tree columns.
        :raise ValueError: if an instance refers to a different tree.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError('Tree mismatch!')

        # The number of words is needed for every row, with or without the
        # tree columns, so it must not depend on include_tree.
        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ['*'] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ''
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += '%-20s ' % (words[i],)
                s += '%-8s ' % (pos[i],)
                s += '%15s*%-8s ' % tuple(synt[i].split('*'))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += '%-20s ' % inst.verb_stem
                    break
            else:
                s += '%-20s ' % '-'
            # Remaining columns: self
            for inst in self:
                argstr = '*'
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = '(%s%s' % (argid, argstr)
                    if i == (end - 1):
                        argstr += ')'
                s += '%-12s ' % argstr
            s += '\n'
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """
        Fill ``words``, ``pos`` and ``synt`` (in place) with the columns for
        ``tree``, whose first leaf has index ``wordnum``.

        :return: the index of the first word after ``tree``.
        """
        if isinstance(tree, tuple):
            # A (word, tag) leaf: store the bare word and its tag.
            assert len(tree) == 2
            words[wordnum], pos[wordnum] = tree
            return wordnum + 1
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], string_types):
            # A preterminal: its label is the POS tag of the word.
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        else:
            # Open this constituent just before the '*', i.e. after the
            # brackets of the enclosing constituents that start on the same
            # word.  No ')' can follow the '*' yet at this point.
            synt[wordnum] = '%s(%s*' % (synt[wordnum][:-1], tree.label())
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ')'
            return wordnum
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader for data files made of exactly three columns, in
    this order: words, pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding='utf8', tagset=None, separator=None
    ):
        # The column layout is fixed for this reader; everything else is
        # forwarded unchanged to the generic CoNLL reader.
        column_layout = ('words', 'pos', 'chunk')
        forwarded_options = dict(
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )
        ConllCorpusReader.__init__(
            self, root, fileids, column_layout, **forwarded_options
        )

View File

@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Crubadan N-grams Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for the n-gram statistics gathered from
the corpora for each language using An Crubadan.
There are multiple potential applications for the data but
this reader was created with the goal of using it in the
context of language identification.
For details about An Crubadan, this data, and its potential uses, see:
http://borel.slu.edu/crubadan/index.html
"""
from __future__ import print_function, unicode_literals
import re
from os import path
from nltk.compat import PY3
from nltk.corpus.reader import CorpusReader
from nltk.probability import FreqDist
from nltk.data import ZipFilePathPointer
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access An Crubadan language n-gram files.
    """

    _LANG_MAPPER_FILE = 'table.txt'
    # Kept for backward compatibility; every instance gets its own cache in
    # __init__ so that readers with different roots do not share results.
    _all_lang_freq = {}

    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param encoding: the encoding handed to ``CorpusReader``.
        :param tagset: accepted but not used by this reader.
        :raise RuntimeError: if the language mapping file cannot be loaded.
        """
        super(CrubadanCorpusReader, self).__init__(root, fileids, encoding=encoding)
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        ''' Return n-gram FreqDist for a specific language
            given ISO 639-3 language code '''
        # Each language file is loaded once and then served from the cache.
        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)

        return self._all_lang_freq[lang]

    def langs(self):
        ''' Return a list of supported languages as ISO 639-3 codes '''
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        ''' Return internal Crubadan code based on ISO 639-3 code
            (None if the code is unknown) '''
        for i in self._lang_mapping_data:
            if i[1].lower() == lang.lower():
                return i[0]

    def crubadan_to_iso(self, lang):
        ''' Return ISO 639-3 code given internal Crubadan code
            (None if the code is unknown) '''
        for i in self._lang_mapping_data:
            if i[0].lower() == lang.lower():
                return i[1]

    def _load_lang_mapping_data(self):
        ''' Load language mappings between codes and description from table.txt '''
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        # Use context managers so the file is closed as soon as it is read.
        if PY3:
            with open(mapper_file, 'r', encoding='utf-8') as mapping:
                raw = mapping.read().strip()
        else:
            with open(mapper_file, 'rU') as mapping:
                raw = mapping.read().decode('utf-8').strip()

        # One row per line; columns are tab-separated.
        self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')]

    def _load_lang_ngrams(self, lang):
        ''' Load single n-gram language file given the ISO 639-3 language code
            and return its FreqDist '''

        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + '-3grams.txt')

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        if PY3:
            f = open(ngram_file, 'r', encoding='utf-8')
        else:
            f = open(ngram_file, 'rU')

        try:
            for line in f:
                if not PY3:
                    line = line.decode('utf8')
                # A blank line has no frequency/n-gram pair to read.
                if not line.strip():
                    continue
                # Each line reads "<frequency> <n-gram>".
                data = line.split(' ')

                ngram = data[1].strip('\n')
                freq = int(data[0])

                counts[ngram] = freq
        finally:
            f.close()

        return counts

View File

@@ -0,0 +1,134 @@
# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
# Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import codecs
from nltk.parse import DependencyGraph
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class DependencyCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for dependency-annotated files in which sentences are
    separated by blank lines and the fields of a word are tab-separated.
    """

    def __init__(
        self,
        root,
        fileids,
        encoding='utf8',
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        para_block_reader=read_blankline_block,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param encoding: the encoding that should be used to read the corpus.

        ``word_tokenizer``, ``sent_tokenizer`` and ``para_block_reader`` are
        accepted but not used by this reader.
        """
        # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
        #       from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        result = []
        for fileid, encoding in self.abspaths(fileids, include_encoding=True):
            if isinstance(fileid, PathPointer):
                stream = fileid.open(encoding=encoding)
                # Close the stream explicitly, as the codecs branch below
                # does through its context manager.
                try:
                    result.append(stream.read())
                finally:
                    stream.close()
            else:
                with codecs.open(fileid, "r", encoding) as fp:
                    result.append(fp.read())
        return concat(result)

    def _views(self, fileids, tagged, group_by_sent, dependencies):
        """Return the concatenation of one DependencyCorpusView per file,
        all built with the given flags."""
        return concat(
            [
                DependencyCorpusView(
                    fileid, tagged, group_by_sent, dependencies, encoding=enc
                )
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def words(self, fileids=None):
        """Return the words of the given file(s) as one flat sequence."""
        return self._views(fileids, False, False, False)

    def tagged_words(self, fileids=None):
        """Return the ``(word, tag)`` tuples of the given file(s) as one flat
        sequence."""
        return self._views(fileids, True, False, False)

    def sents(self, fileids=None):
        """Return the given file(s) as a sequence of sentences, each a list
        of words."""
        return self._views(fileids, False, True, False)

    def tagged_sents(self, fileids=None):
        """Return the given file(s) as a sequence of sentences, each a list
        of ``(word, tag)`` tuples."""
        return self._views(fileids, True, True, False)

    def parsed_sents(self, fileids=None):
        """Return a list with one ``DependencyGraph`` per sentence, built from
        the raw text of the sentence."""
        sents = self._views(fileids, False, True, True)
        return [DependencyGraph(sent) for sent in sents]
class DependencyCorpusView(StreamBackedCorpusView):
    """
    Corpus view that reads one blank-line delimited sentence per block and
    returns it as words, ``(word, tag)`` tuples or raw text.
    """

    _DOCSTART = '-DOCSTART- -DOCSTART- O\n'  # marks the start of a document

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding='utf8',
    ):
        """
        :param corpus_file: the file this view reads from.
        :param tagged: if true, keep the tag of each word.
        :param group_by_sent: if true, each block yields one sentence item;
            otherwise the items of the sentence are yielded individually.
        :param dependencies: if true, sentences are returned as raw text
            instead of being split into words.
        :param chunk_types: stored but not used by this view.
        :param encoding: the encoding used to read ``corpus_file``.
        """
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read the next sentence from ``stream``.

        :return: ``[sentence]`` when grouping by sentence, otherwise the list
            of the sentence's items; an empty list when no sentence is left
            or the block only held the document-start marker.
        :raise ValueError: if the first line of the sentence does not have
            3, 4 or 10 tab-separated fields.
        """
        # Read the next sentence.  Nothing is returned when only blank lines
        # remain before the end of the file.
        blocks = read_blankline_block(stream)
        if not blocks:
            return []
        sent = blocks[0].strip()
        # Strip off the docstart marker, if present.  The marker ends with a
        # newline, so a block holding nothing but the marker only matches
        # its stripped form.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()
        elif sent == self._DOCSTART.strip():
            sent = ''
        if not sent:
            return []

        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split('\t') for line in sent.split('\n')]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError('Unexpected number of fields in dependency tree file')

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,129 @@
# Natural Language Toolkit: IEER Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Information Extraction and Entity Recognition Corpus.
NIST 1999 Information Extraction: Entity Recognition Evaluation
http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm
This corpus contains the NEWSWIRE development test data for the
NIST 1999 IE-ER Evaluation. The files were taken from the
subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt
and filenames were shortened.
The corpus contains the following files: APW_19980314, APW_19980424,
APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407.
"""
from __future__ import unicode_literals
from six import string_types
import nltk
from nltk import compat
from nltk.corpus.reader.api import *
#: A dictionary whose keys are the names of documents in this corpus;
#: and whose values are descriptions of those documents' contents
#: (source and publication date).
titles = {
    'APW_19980314': 'Associated Press Weekly, 14 March 1998',
    'APW_19980424': 'Associated Press Weekly, 24 April 1998',
    'APW_19980429': 'Associated Press Weekly, 29 April 1998',
    'NYT_19980315': 'New York Times, 15 March 1998',
    'NYT_19980403': 'New York Times, 3 April 1998',
    'NYT_19980407': 'New York Times, 7 April 1998',
}
#: A list of all documents in this corpus (the keys of ``titles``, sorted).
documents = sorted(titles)
@compat.python_2_unicode_compatible
class IEERDocument(object):
    """
    A single IEER document: its text plus the optional document number,
    document type, date/time and headline.
    """

    def __init__(self, text, docno=None, doctype=None, date_time=None, headline=''):
        self.text = text
        self.docno = docno
        self.doctype = doctype
        self.date_time = date_time
        self.headline = headline

    def __repr__(self):
        # Describe the document by its headline; without one, fall back to
        # the first twelve leaves of the text that do not start with '<'.
        if self.headline:
            summary = ' '.join(self.headline.leaves())
        else:
            plain_leaves = []
            for leaf in self.text.leaves():
                if leaf[:1] != '<':
                    plain_leaves.append(leaf)
            summary = ' '.join(plain_leaves[:12]) + '...'

        if self.docno is None:
            return '<IEERDocument: %r>' % summary
        return '<IEERDocument %s: %r>' % (self.docno, summary)
class IEERCorpusReader(CorpusReader):
    """
    Corpus reader for IEER files, in which each document is enclosed
    between a ``<DOC>`` line and a ``</DOC>`` line.
    """

    def raw(self, fileids=None):
        """Return the content of the given file(s) as a single string."""
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close each stream explicitly instead of relying on garbage
            # collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def docs(self, fileids=None):
        """Return the documents of the given file(s) as unparsed strings."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def parsed_docs(self, fileids=None):
        """Return the documents of the given file(s) as ``IEERDocument``
        objects; documents without a document number are left out."""
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_parsed_block(self, stream):
        """Parse the next document and return it in a list, or return an
        empty list if it has no document number."""
        # TODO: figure out why empty documents are being returned
        # Parse each document only once, then filter on the parsed result.
        parsed_docs = [self._parse(doc) for doc in self._read_block(stream)]
        return [doc for doc in parsed_docs if doc.docno is not None]

    def _parse(self, doc):
        """Turn the string ``doc`` into an ``IEERDocument``."""
        val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT")
        # A dict result carries the document fields as keyword arguments;
        # any other result is used as the document text.
        if isinstance(val, dict):
            return IEERDocument(**val)
        else:
            return IEERDocument(val)

    def _read_block(self, stream):
        """Read the next ``<DOC>`` ... ``</DOC>`` document from ``stream`` and
        return it as a one-element list of strings."""
        out = []
        # Skip any preamble.
        while True:
            line = stream.readline()
            if not line:
                break
            if line.strip() == '<DOC>':
                break
        # Keep the line that ended the preamble (the <DOC> line, or '' at
        # end of file).
        out.append(line)
        # Read the document
        while True:
            line = stream.readline()
            if not line:
                break
            out.append(line)
            if line.strip() == '</DOC>':
                break
        # Return the document
        return ['\n'.join(out)]

View File

@@ -0,0 +1,103 @@
# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Indian Language POS-Tagged Corpus
Collected by A Kumaran, Microsoft Research, India
Distributed with permission
Contents:
- Bangla: IIT Kharagpur
- Hindi: Microsoft Research India
- Marathi: IIT Bombay
- Telugu: IIIT Hyderabad
"""
from six import string_types
from nltk.tag import str2tuple, map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class IndianCorpusReader(CorpusReader):
    """
    Corpus reader built on ``IndianCorpusView``: each line of a file is
    split on whitespace into tokens, and each token is split into a
    ``(word, tag)`` pair on ``'_'``.
    """

    def _tag_mapper(self, tagset):
        """
        Return a function mapping this corpus's tags to ``tagset``, or
        None when no mapping is needed.
        """
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def words(self, fileids=None):
        """Return the words of the given files as a flat sequence."""
        return concat(
            [
                IndianCorpusView(fileid, enc, False, False)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_words(self, fileids=None, tagset=None):
        """
        Return the ``(word, tag)`` pairs of the given files as a flat
        sequence, with tags mapped to ``tagset`` when one is given.
        """
        tag_mapping_function = self._tag_mapper(tagset)
        return concat(
            [
                IndianCorpusView(fileid, enc, True, False, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """Return the given files as a sequence of word lists, one per line."""
        return concat(
            [
                IndianCorpusView(fileid, enc, False, True)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        Return the given files as a sequence of ``(word, tag)`` lists, one
        per line, with tags mapped to ``tagset`` when one is given.
        """
        tag_mapping_function = self._tag_mapper(tagset)
        return concat(
            [
                IndianCorpusView(fileid, enc, True, True, tag_mapping_function)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        Return the concatenated contents of the given files.

        :param fileids: a single fileid, a list of fileids, or None for
            every file in the corpus.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly rather than leaving it to the
            # garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)
class IndianCorpusView(StreamBackedCorpusView):
    """
    Corpus view that turns each line of a file into word or
    ``(word, tag)`` tokens, optionally grouped per line.
    """

    def __init__(
        self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None
    ):
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read one line from ``stream`` and return its tokens."""
        line = stream.readline()
        # Lines that open with '<' contribute no tokens.
        if line.startswith('<'):
            return []
        mapper = self._tag_mapping_function
        tokens = []
        for item in line.split():
            word, tag = str2tuple(item, sep='_')
            if mapper:
                tag = mapper(tag)
            tokens.append((word, tag) if self._tagged else word)
        return [tokens] if self._group_by_sent else tokens

View File

@@ -0,0 +1,368 @@
# Natural Language Toolkit: IPI PAN Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Konrad Goluchowski <kodie@mimuw.edu.pl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import functools
from six import string_types
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
from nltk.corpus.reader.api import CorpusReader
def _parse_args(fun):
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
kwargs.pop('tags', None)
if not fileids:
fileids = self.fileids()
return fun(self, fileids, **kwargs)
return decorator
class IPIPANCorpusReader(CorpusReader):
    """
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can also use this metadata to filter files, e.g.:
    ``fileids(channels='prasa')``, ``fileids(categories='publicystyczny')``.
    Only one of these filters may be given per call.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    You can also get all disambiguated tags by specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, None, None)

    def raw(self, fileids=None):
        """Return the concatenated contents of the given (or all) files."""
        if not fileids:
            fileids = self.fileids()

        filecontents = []
        for fileid in self._list_morph_files(fileids):
            with open(fileid, 'r') as infile:
                filecontents.append(infile.read())
        return ''.join(filecontents)

    def channels(self, fileids=None):
        """Return the distinct ``channel`` header values of the given files."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, 'channel')

    def domains(self, fileids=None):
        """Return the distinct ``domain`` header values of the given files."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, 'domain')

    def categories(self, fileids=None):
        """Return the distinct ``keyTerm`` header values of the given files."""
        if not fileids:
            fileids = self.fileids()
        return [
            self._map_category(cat) for cat in self._parse_header(fileids, 'keyTerm')
        ]

    def fileids(self, channels=None, domains=None, categories=None):
        """
        Return the fileids of the corpus, optionally filtered by metadata.

        :param channels: a channel name or list of names to filter by.
        :param domains: a domain name or list of names to filter by.
        :param categories: a category name or list of names to filter by.
        :raise ValueError: if more than one of the three filters is given.
        """
        given = [arg for arg in (channels, domains, categories) if arg is not None]
        # The filters are mutually exclusive: reject any combination, not
        # only the case where all three are supplied.
        if len(given) > 1:
            raise ValueError(
                'You can specify only one of channels, domains '
                'and categories parameter at once'
            )
        if not given:
            return CorpusReader.fileids(self)

        if isinstance(channels, string_types):
            channels = [channels]
        if isinstance(domains, string_types):
            domains = [domains]
        if isinstance(categories, string_types):
            categories = [categories]

        # Dispatch on which filter was supplied (not on its truthiness), so
        # that an empty list selects no files instead of falling through.
        if channels is not None:
            return self._list_morph_files_by('channel', channels)
        if domains is not None:
            return self._list_morph_files_by('domain', domains)
        return self._list_morph_files_by(
            'keyTerm', categories, map=self._map_category
        )

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """Return the given files as a sequence of sentences (untagged)."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        """Return the given files as a sequence of paragraphs (untagged)."""
        return concat(
            [
                self._view(
                    fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs
                )
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """Return the given files as a sequence of words (untagged)."""
        return concat(
            [
                self._view(fileid, tags=False, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        """Return the given files as a sequence of tagged sentences."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        """Return the given files as a sequence of tagged paragraphs."""
        return concat(
            [
                self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs)
                for fileid in self._list_morph_files(fileids)
            ]
        )

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """Return the given files as a sequence of tagged words."""
        return concat(
            [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]
        )

    def _list_morph_files(self, fileids):
        """Return the absolute paths of the given fileids as a list."""
        return list(self.abspaths(fileids))

    def _list_header_files(self, fileids):
        """Return the header-file path that corresponds to each fileid."""
        return [
            f.replace('morph.xml', 'header.xml')
            for f in self._list_morph_files(fileids)
        ]

    def _parse_header(self, fileids, tag):
        """Return the distinct values of ``tag`` over the files' headers."""
        values = set()
        for f in self._list_header_files(fileids):
            values.update(self._get_tag(f, tag))
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """
        Return the fileids whose header has a ``tag`` value (passed through
        ``map`` when given) that is contained in ``values``.
        """
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace('morph.xml', 'header.xml')
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """
        Return the text of every ``tag`` element found in file ``f``.

        The scan is purely textual: each value starts ``len(tag) + 2``
        characters after a ``'<' + tag`` match and ends at the next
        closing tag, so an opening tag that carries attributes leaves its
        remainder at the front of the value.
        """
        tags = []
        with open(f, 'r') as infile:
            header = infile.read()
        tag_end = 0
        while True:
            tag_pos = header.find('<' + tag, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find('</' + tag + '>', tag_pos)
            tags.append(header[tag_pos + len(tag) + 2 : tag_end])

    def _map_category(self, cat):
        """Return ``cat`` with everything up to its first '>' removed."""
        pos = cat.find('>')
        if pos == -1:
            return cat
        else:
            return cat[pos + 1 :]

    def _view(self, filename, **kwargs):
        """
        Validate the view options in ``kwargs`` and build an
        ``IPIPANCorpusView`` over ``filename``.

        :raise ValueError: on unknown options or on option combinations
            that are rejected below.
        """
        tags = kwargs.pop('tags', True)
        mode = kwargs.pop('mode', 0)
        simplify_tags = kwargs.pop('simplify_tags', False)
        one_tag = kwargs.pop('one_tag', True)
        disamb_only = kwargs.pop('disamb_only', True)
        append_no_space = kwargs.pop('append_no_space', False)
        append_space = kwargs.pop('append_space', False)
        replace_xmlentities = kwargs.pop('replace_xmlentities', True)

        # Anything still left in kwargs is not a recognised option.
        if len(kwargs) > 0:
            raise ValueError('Unexpected arguments: %s' % kwargs.keys())
        if not one_tag and not disamb_only:
            raise ValueError(
                'You cannot specify both one_tag=False and ' 'disamb_only=False'
            )
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError(
                'You cannot specify simplify_tags, one_tag or '
                'disamb_only with functions other than tagged_*'
            )

        return IPIPANCorpusView(
            filename,
            tags=tags,
            mode=mode,
            simplify_tags=simplify_tags,
            one_tag=one_tag,
            disamb_only=disamb_only,
            append_no_space=append_no_space,
            append_space=append_space,
            replace_xmlentities=replace_xmlentities,
        )
class IPIPANCorpusView(StreamBackedCorpusView):
    """
    Corpus view that scans a file line by line and assembles words,
    sentences or paragraphs (depending on ``mode``) from the ``<chunk>``,
    ``<tok>``, ``<orth>``, ``<lex>`` and ``<ns/>`` lines it encounters.
    """

    # Values accepted for the ``mode`` option.
    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2

    def __init__(self, filename, startpos=0, **kwargs):
        """
        :param filename: the file to read.
        :param startpos: forwarded to ``StreamBackedCorpusView.__init__``.
        :param kwargs: view options; see the ``kwargs.pop`` calls below
            for the recognised names and their defaults.
        """
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        # True while the scanner is inside a sentence chunk.
        self.in_sentence = False
        # Stream offset just past the last line consumed by read_block.
        self.position = 0

        self.show_tags = kwargs.pop('tags', True)
        self.disamb_only = kwargs.pop('disamb_only', True)
        self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop('simplify_tags', False)
        self.one_tag = kwargs.pop('one_tag', True)
        self.append_no_space = kwargs.pop('append_no_space', False)
        self.append_space = kwargs.pop('append_space', False)
        self.replace_xmlentities = kwargs.pop('replace_xmlentities', True)

    def read_block(self, stream):
        """
        Read from ``stream`` until one unit for the current mode is
        complete and return it: a sentence's tokens (WORDS_MODE), a
        one-element list holding the sentence (SENTS_MODE), or a
        one-element list holding the paragraph's sentences (PARAS_MODE).
        Returns ``[]`` when no more data can be read.
        """
        sentence = []
        sentences = []
        space = False
        no_space = False

        tags = set()

        lines = self._read_data(stream)

        while True:

            # we may have only part of last line
            if len(lines) <= 1:
                # Re-read starting at the first unconsumed line.
                self._seek(stream)
                lines = self._read_data(stream)

            # An empty read splits into [''], i.e. nothing left to read.
            if lines == ['']:
                assert not sentences
                return []

            # ``lines`` is reversed, so pop() yields lines in file order.
            line = lines.pop()
            # Advance past the line plus the '\n' that split() removed.
            self.position += len(line) + 1

            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith('<tok'):
                # Emit a space before this token unless <ns/> preceded it.
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith('</chunk'):
                if self.in_sentence:
                    # End of a sentence chunk.
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        # NOTE(review): ``sentence`` is not reset here, so
                        # later sentences of the paragraph extend the same
                        # list object -- confirm this is intended.
                        sentences.append(sentence)
                elif self.mode == self.PARAS_MODE:
                    # A chunk closed outside a sentence: end of paragraph.
                    self._seek(stream)
                    return [sentences]
            elif line.startswith('<orth'):
                # Drop 6 leading and 7 trailing characters; assumes the
                # line is exactly '<orth>...</orth>'.
                orth = line[6:-7]
                if self.replace_xmlentities:
                    orth = orth.replace('&quot;', '"').replace('&amp;', '&')
            elif line.startswith('<lex'):
                if not self.disamb_only or line.find('disamb=') != -1:
                    # Text between '<ctag>' (6 characters) and '</ctag'.
                    tag = line[line.index('<ctag') + 6 : line.index('</ctag')]
                    tags.add(tag)
            elif line.startswith('</tok'):
                if self.show_tags:
                    if self.simplify_tags:
                        # Keep only the part of each tag before the first ':'.
                        tags = [t.split(':')[0] for t in tags]

                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        # NOTE(review): pop() raises if no tag was
                        # collected for this token.
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith('<ns/>'):
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(('', 'no-space'))
                    else:
                        sentence.append('')
            elif line.startswith('</cesAna'):
                pass

    def _read_data(self, stream):
        """
        Remember the current stream offset in ``self.position``, read up
        to 4096 characters and return them split into lines, reversed.
        """
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split('\n')
        lines.reverse()
        return lines

    def _seek(self, stream):
        """Move ``stream`` to the offset stored in ``self.position``."""
        stream.seek(self.position)

    def _append_space(self, sentence):
        """Append a space marker to ``sentence``, tagged if tags are shown."""
        if self.show_tags:
            sentence.append((' ', 'space'))
        else:
            sentence.append(' ')

View File

@@ -0,0 +1,194 @@
#! /usr/bin/env python
# KNB Corpus reader
# Copyright (C) 2001-2019 NLTK Project
# Author: Masato Hagiwara <hagisan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html
from __future__ import print_function
import re
from six import string_types
from nltk.parse import DependencyGraph
from nltk.corpus.reader.util import (
FileSystemPathPointer,
find_corpus_fileids,
read_blankline_block,
)
from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader
# default function to convert morphlist to str for tree representation
_morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS')
class KNBCorpusReader(SyntaxCorpusReader):
    """
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example
    -------------

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    """

    def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        # FIXME: Why is it inheriting from SyntaxCorpusReader but initializing
        #       from CorpusReader?
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str

    def _read_block(self, stream):
        # blocks are split by blankline (or EOF) - default
        return read_blankline_block(stream)

    def _word(self, t):
        """Return the first field of every morph line in block ``t``."""
        # Lines that start with 'EOS', '*', '#' or '+' are not morph lines.
        return [
            row.strip().split(" ")[0]
            for row in t.splitlines()
            if not re.match(r"EOS|\*|\#|\+", row)
        ]

    # ignores tagset argument
    def _tag(self, t, tagset=None):
        """Return a ``(word, tags)`` pair for every morph line in block ``t``."""
        tagged = []
        for row in t.splitlines():
            # Lines that start with 'EOS', '*', '#' or '+' are not morph lines.
            if re.match(r"EOS|\*|\#|\+", row):
                continue
            fields = row.strip().split(" ")
            # convert fields to a morph tuple
            tagged.append((fields[0], ' '.join(fields[1:])))
        return tagged

    def _parse(self, t):
        """Build a dependency graph from block ``t`` and return its tree."""
        graph = DependencyGraph()
        next_address = 0
        for row in t.splitlines():
            marker = row[0]
            if marker in '*+':
                # start of bunsetsu or tag
                fields = row.strip().split(" ", 3)
                match = re.match(r"([\-0-9]*)([ADIP])", fields[1])

                assert match is not None

                node = graph.nodes[next_address]
                node.update(
                    {'address': next_address, 'rel': match.group(2), 'word': []}
                )

                parent = int(match.group(1))
                # A parent of -1 marks the root of the graph.
                if parent == -1:
                    graph.root = node
                else:
                    graph.nodes[parent]['deps'].append(next_address)

                next_address += 1
                continue
            if marker == '#':
                continue
            # normal morph: attach it to the most recently opened node
            fields = row.strip().split(" ")
            morph = fields[0], ' '.join(fields[1:])
            graph.nodes[next_address - 1]['word'].append(morph)

        if self.morphs2str:
            for node in graph.nodes.values():
                node['word'] = self.morphs2str(node['word'])

        return graph.tree()
######################################################################
# Demo
######################################################################
def demo():
    """Load the KNB corpus and print sample fileids, words, trees and tags."""
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = []
    for name in find_corpus_fileids(FileSystemPathPointer(root), ".*"):
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", name):
            fileids.append(name)

    def _knbc_fileids_sort(x):
        # Sort on the first dash-separated field as text and on the next
        # three as integers.
        parts = x.split('-')
        return (parts[0], int(parts[1]), int(parts[2]), int(parts[3]))

    ordered_fileids = sorted(fileids, key=_knbc_fileids_sort)
    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, ordered_fileids, encoding='euc-jp'
    )

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    def _morphs_with_third_field(morphs):
        # Show each morph as "word(field)", using the third space-separated
        # field of its tag string.
        shown = [
            "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        ]
        return '/'.join(shown).encode('utf-8')

    knbc.morphs2str = _morphs_with_third_field

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    rendered = []
    for sent in knbc.tagged_sents()[0:2]:
        rendered.append(
            ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        )
    print('\n'.join(rendered))
def test():
    """Check the element types returned by the KNB corpus accessors."""
    from nltk.corpus.util import LazyCorpusLoader

    knbc = LazyCorpusLoader(
        'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp'
    )

    first_word = knbc.words()[0]
    assert isinstance(first_word, string_types)

    first_sent_word = knbc.sents()[0][0]
    assert isinstance(first_sent_word, string_types)

    first_tagged_word = knbc.tagged_words()[0]
    assert isinstance(first_tagged_word, tuple)

    first_tagged_sent_word = knbc.tagged_sents()[0][0]
    assert isinstance(first_tagged_sent_word, tuple)
if __name__ == '__main__':
demo()

View File

@@ -0,0 +1,184 @@
# Natural Language Toolkit: Lin's Thesaurus
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Dan Blanchard <dblanchard@ets.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.txt
from __future__ import print_function
import re
from collections import defaultdict
from functools import reduce
from nltk.corpus.reader import CorpusReader
class LinThesaurusCorpusReader(CorpusReader):
    """ Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """

    # Compiled regular expression for extracting the key from the first line of each
    # thesaurus entry
    _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')

    @staticmethod
    def __defaultdict_factory():
        ''' Factory for creating defaultdict of defaultdict(dict)s '''
        return defaultdict(dict)

    def __init__(self, root, badscore=0.0):
        '''
        Initialize the thesaurus.

        :param root: root directory containing thesaurus LISP files
        :type root: C{string}
        :param badscore: the score to give to words which do not appear in each other's sets of synonyms
        :type badscore: C{float}
        '''

        super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
        # Nested mapping: fileid -> entry key -> ngram -> score.
        self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
        self._badscore = badscore
        for path, encoding, fileid in self.abspaths(
            include_encoding=True, include_fileid=True
        ):
            with open(path) as lin_file:
                first = True
                for line in lin_file:
                    line = line.strip()
                    # Start of entry
                    if first:
                        key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
                        first = False
                    # End of entry
                    elif line == '))':
                        first = True
                    # Lines with pairs of ngrams and scores
                    else:
                        split_line = line.split('\t')
                        if len(split_line) == 2:
                            ngram, score = split_line
                            self._thesaurus[fileid][key][ngram.strip('"')] = float(
                                score
                            )

    def _entry(self, fileid, ngram):
        '''
        Return the ngram -> score mapping stored for ``ngram`` in ``fileid``,
        or an empty dict when there is none.

        Uses ``get`` rather than indexing so that a lookup never inserts an
        empty entry into the nested defaultdicts (which would make
        ``__contains__`` report ngrams that were merely queried).
        '''
        return self._thesaurus.get(fileid, {}).get(ngram, {})

    def similarity(self, ngram1, ngram2, fileid=None):
        '''
        Returns the similarity score for two ngrams.

        :param ngram1: first ngram to compare
        :type ngram1: C{string}
        :param ngram2: second ngram to compare
        :type ngram2: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, just the score for the two ngrams; otherwise,
                 list of tuples of fileids and scores.
        '''
        # Entries don't contain themselves, so make sure similarity between item and itself is 1.0
        if ngram1 == ngram2:
            if fileid:
                return 1.0
            return [(fid, 1.0) for fid in self._fileids]
        if fileid:
            return self._entry(fileid, ngram1).get(ngram2, self._badscore)
        return [
            (fid, self._entry(fid, ngram1).get(ngram2, self._badscore))
            for fid in self._fileids
        ]

    def scored_synonyms(self, ngram, fileid=None):
        '''
        Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, the (synonym, score) items of the entry; otherwise,
                 list of tuples of fileids and such item collections.
        '''
        if fileid:
            return self._entry(fileid, ngram).items()
        return [(fid, self._entry(fid, ngram).items()) for fid in self._fileids]

    def synonyms(self, ngram, fileid=None):
        '''
        Returns a list of synonyms for the current ngram.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :param fileid: thesaurus fileid to search in. If None, search all fileids.
        :type fileid: C{string}
        :return: If fileid is specified, the synonyms of the entry; otherwise, list of tuples of
                 fileids and synonym collections.
        '''
        if fileid:
            return self._entry(fileid, ngram).keys()
        return [(fid, self._entry(fid, ngram).keys()) for fid in self._fileids]

    def __contains__(self, ngram):
        '''
        Determines whether or not the given ngram is in the thesaurus.

        :param ngram: ngram to lookup
        :type ngram: C{string}
        :return: whether the given ngram is in the thesaurus.
        '''
        return any(
            ngram in self._thesaurus.get(fileid, {}) for fileid in self._fileids
        )
######################################################################
# Demo
######################################################################
def demo():
    """Print synonym and similarity lookups for a sample pair of words."""
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"

    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    # Scored counterpart of the single-file lookup above, mirroring the
    # synonyms / scored-synonyms pair shown for the whole thesaurus.
    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
if __name__ == '__main__':
demo()

View File

@@ -0,0 +1,414 @@
"""
A reader for corpora whose documents are in MTE format.
"""
import os
import re
from functools import reduce
from six import string_types
from nltk.corpus.reader import concat, TaggedCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusView
def xpath(root, path, ns):
    """
    Return ``root.findall(path, ns)``.

    :param root: the element to search from; must provide ``findall``.
    :param path: the path expression to match.
    :param ns: the namespace argument passed through to ``findall``
        (callers in this module pass a prefix-to-URI dict).
    """
    return root.findall(path, ns)
class MTECorpusView(XMLCorpusView):
    """
    Lazy view over the MTE corpus that drops ``None`` entries from every
    block it reads.
    """

    def __init__(self, fileid, tagspec, elt_handler=None):
        XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """Read a block via ``XMLCorpusView`` and return its non-None items."""
        block = XMLCorpusView.read_block(self, stream, tagspec, elt_handler)
        return [item for item in block if item is not None]
class MTEFileReader:
    """
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.

    The ``tagged_*`` methods bind the requested tagset and tag filter to
    the view they return, so views created with different settings do not
    affect each other.
    """

    ns = {
        'tei': 'http://www.tei-c.org/ns/1.0',
        'xml': 'http://www.w3.org/XML/1998/namespace',
    }
    tag_ns = '{http://www.tei-c.org/ns/1.0}'
    xml_ns = '{http://www.w3.org/XML/1998/namespace}'
    word_path = "TEI/text/body/div/div/p/s/(w|c)"
    sent_path = "TEI/text/body/div/div/p/s"
    para_path = "TEI/text/body/div/div/p"

    # Most recently requested settings. They are only a fallback for the
    # ``_tagged_*_elt`` handlers when those are called without explicit
    # ``tagset`` / ``tags`` arguments.
    __tagset = "msd"
    __tags = ""

    def __init__(self, file_path):
        self.__file_path = file_path

    @classmethod
    def _word_elt(cls, elt, context):
        """Return the text of a word element."""
        return elt.text

    @classmethod
    def _sent_elt(cls, elt, context):
        """Return the texts of the child elements of a sentence element."""
        return [cls._word_elt(w, None) for w in xpath(elt, '*', cls.ns)]

    @classmethod
    def _para_elt(cls, elt, context):
        """Return one word list per child element of a paragraph element."""
        return [cls._sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]

    @classmethod
    def _tagged_word_elt(cls, elt, context, tagset=None, tags=None):
        """
        Return ``(text, tag)`` for a word element, or None if the element's
        tag does not match the ``tags`` filter.

        :param tagset: "msd" to return the ``ana`` attribute unchanged; any
            other value converts it with ``MTETagConverter``. Falls back to
            the last setting stored on the class when None.
        :param tags: tag filter, matched as a prefix with every '-' acting
            as a one-character wildcard; "" disables filtering. Falls back
            to the last setting stored on the class when None.
        """
        if tagset is None:
            tagset = cls.__tagset
        if tags is None:
            tags = cls.__tags
        # Elements without an 'ana' attribute get an empty tag.
        if 'ana' not in elt.attrib:
            return (elt.text, '')

        if tags == "" and tagset == "msd":
            return (elt.text, elt.attrib['ana'])
        elif tags == "" and tagset == "universal":
            return (elt.text, MTETagConverter.msd_to_universal(elt.attrib['ana']))
        else:
            pattern = re.compile('^' + re.sub("-", ".", tags) + '.*$')
            if pattern.match(elt.attrib['ana']):
                if tagset == "msd":
                    return (elt.text, elt.attrib['ana'])
                else:
                    return (
                        elt.text,
                        MTETagConverter.msd_to_universal(elt.attrib['ana']),
                    )
            else:
                return None

    @classmethod
    def _tagged_sent_elt(cls, elt, context, tagset=None, tags=None):
        """Return the tagged words of a sentence element, minus filtered ones."""
        tagged = [
            cls._tagged_word_elt(w, None, tagset, tags)
            for w in xpath(elt, '*', cls.ns)
        ]
        return [pair for pair in tagged if pair is not None]

    @classmethod
    def _tagged_para_elt(cls, elt, context, tagset=None, tags=None):
        """Return one tagged-word list per child of a paragraph element."""
        sents = [
            cls._tagged_sent_elt(s, None, tagset, tags)
            for s in xpath(elt, '*', cls.ns)
        ]
        return [sent for sent in sents if sent is not None]

    @classmethod
    def _lemma_word_elt(cls, elt, context):
        """Return ``(text, lemma)``; the lemma is '' when the attribute is absent."""
        if 'lemma' not in elt.attrib:
            return (elt.text, '')
        else:
            return (elt.text, elt.attrib['lemma'])

    @classmethod
    def _lemma_sent_elt(cls, elt, context):
        """Return the ``(text, lemma)`` pairs of a sentence element's children."""
        return [cls._lemma_word_elt(w, None) for w in xpath(elt, '*', cls.ns)]

    @classmethod
    def _lemma_para_elt(cls, elt, context):
        """Return one ``(text, lemma)`` list per child of a paragraph element."""
        return [cls._lemma_sent_elt(s, None) for s in xpath(elt, '*', cls.ns)]

    def words(self):
        """Return a view over the words of the file."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt
        )

    def sents(self):
        """Return a view over the sentences of the file."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt
        )

    def paras(self):
        """Return a view over the paragraphs of the file."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt
        )

    def lemma_words(self):
        """Return a view over the ``(word, lemma)`` pairs of the file."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt
        )

    def tagged_words(self, tagset, tags):
        """Return a view over the ``(word, tag)`` pairs of the file."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        # Bind this call's settings to the handler: the view is lazy, so
        # reading the class attributes later could pick up another call's.
        return MTECorpusView(
            self.__file_path,
            MTEFileReader.word_path,
            lambda elt, context: MTEFileReader._tagged_word_elt(
                elt, context, tagset, tags
            ),
        )

    def lemma_sents(self):
        """Return a view over the sentences of the file as ``(word, lemma)`` lists."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt
        )

    def tagged_sents(self, tagset, tags):
        """Return a view over the sentences of the file as ``(word, tag)`` lists."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        # See tagged_words for why the settings are bound to the handler.
        return MTECorpusView(
            self.__file_path,
            MTEFileReader.sent_path,
            lambda elt, context: MTEFileReader._tagged_sent_elt(
                elt, context, tagset, tags
            ),
        )

    def lemma_paras(self):
        """Return a view over the paragraphs of the file with lemmas."""
        return MTECorpusView(
            self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt
        )

    def tagged_paras(self, tagset, tags):
        """Return a view over the paragraphs of the file with tags."""
        MTEFileReader.__tagset = tagset
        MTEFileReader.__tags = tags
        # See tagged_words for why the settings are bound to the handler.
        return MTECorpusView(
            self.__file_path,
            MTEFileReader.para_path,
            lambda elt, context: MTEFileReader._tagged_para_elt(
                elt, context, tagset, tags
            ),
        )
class MTETagConverter:
    """
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    """

    # Maps the category character of an MSD tag to a universal tag.
    # The '-' entry is the fallback used for unknown categories.
    mapping_msd_universal = {
        'A': 'ADJ',
        'S': 'ADP',
        'R': 'ADV',
        'C': 'CONJ',
        'D': 'DET',
        'N': 'NOUN',
        'M': 'NUM',
        'Q': 'PRT',
        'P': 'PRON',
        'V': 'VERB',
        '.': '.',
        '-': 'X',
    }

    @staticmethod
    def msd_to_universal(tag):
        """
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book

        The category is the first character of ``tag``, or the second one
        when the tag starts with '#'. Unknown categories are mapped to X,
        and so are tags that have no category character at all ('' or '#').

        :param tag: the MSD tag to convert.
        :return: the corresponding universal tag.
        """
        # Slicing instead of indexing keeps '' and '#' from raising IndexError.
        indicator = tag[1:2] if tag[:1] == "#" else tag[:1]
        mapping = MTETagConverter.mapping_msd_universal
        return mapping.get(indicator, mapping['-'])
class MTECorpusReader(TaggedCorpusReader):
"""
Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
scheme. These tags can be converted to the Universal tagset.
"""
def __init__(self, root=None, fileids=None, encoding='utf8'):
    """
    Construct a new MTECorpusReader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

    All three arguments are passed unchanged to ``TaggedCorpusReader.__init__``.

    :param root: The root directory for this corpus. Defaults to None.
    :param fileids: A list or regexp specifying the fileids in this corpus.
        Defaults to None.
    :param encoding: The encoding of the given files (default is utf8)
    """
    TaggedCorpusReader.__init__(self, root, fileids, encoding)
def __fileids(self, fileids):
    """
    Normalise ``fileids`` to a list of usable fileids.

    :param fileids: a single fileid, an iterable of fileids, or None for
        every file in the corpus.
    :return: the requested fileids that belong to this corpus, without
        the files excluded below. A message is printed when none remain.
    :rtype: list(str)
    """
    if fileids is None:
        fileids = self._fileids
    elif isinstance(fileids, string_types):
        fileids = [fileids]
    # multext-east sourcefiles that are not compatible to the teip5 specification
    incompatible = ("oana-bg.xml", "oana-mk.xml")
    # Build a real list: a filter() object is always truthy in Python 3, so
    # the emptiness check below would never fire on it.
    valid = [
        fileid
        for fileid in fileids
        # filter wrong userinput
        if fileid in self._fileids and fileid not in incompatible
    ]
    if not valid:
        print("No valid multext-east file specified")
    return valid
def readme(self):
    """
    Return some information about this corpus.

    :return: the content of the attached README file
    :rtype: str
    """
    stream = self.open("00README.txt")
    # Close the stream explicitly rather than leaving it to the garbage
    # collector.
    try:
        return stream.read()
    finally:
        stream.close()
def raw(self, fileids=None):
    """
    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a single string.
    :rtype: str
    """
    contents = []
    for fileid in self.__fileids(fileids):
        stream = self.open(fileid)
        # Close every stream explicitly rather than leaving it to the
        # garbage collector.
        try:
            contents.append(stream.read())
        finally:
            stream.close()
    # Join the per-file contents with concat(); passing the list to
    # reduce() as if it were the function always raised TypeError.
    return concat(contents)
def words(self, fileids=None):
    """
    Return the words and punctuation symbols of the selected files.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of words and punctuation symbols.
    :rtype: list(str)
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.words())
    return concat(views)
def sents(self, fileids=None):
    """
    Return the sentences of the selected files.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of sentences or utterances,
             each encoded as a list of word strings
    :rtype: list(list(str))
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.sents())
    return concat(views)
def paras(self, fileids=None):
    """
    Return the paragraphs of the selected files.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of paragraphs, each encoded as a list
             of sentences, which are in turn encoded as lists of word string
    :rtype: list(list(list(str)))
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.paras())
    return concat(views)
def lemma_words(self, fileids=None):
    """
    Return the words of the selected files paired with their lemmas.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of words, the corresponding lemmas
             and punctuation symbols, encoded as tuples (word, lemma)
    :rtype: list(tuple(str,str))
    """
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.lemma_words())
    return concat(views)
def tagged_words(self, fileids=None, tagset="msd", tags=""):
    """
    Return the tagged words of the selected files.

    :param fileids: A list specifying the fileids that should be used.
    :param tagset: The tagset that should be used in the returned object,
                   either "universal" or "msd", "msd" is the default
    :param tags: An MSD Tag that is used to filter all parts of the used corpus
                 that are not more precise or at least equal to the given tag
    :return: the given file(s) as a list of tagged words and punctuation symbols
             encoded as tuples (word, tag); None (after printing a message)
             when ``tagset`` is not one of the two supported values
    :rtype: list(tuple(str, str))
    """
    if tagset not in ("universal", "msd"):
        print("Unknown tagset specified.")
        return None
    views = []
    for fileid in self.__fileids(fileids):
        reader = MTEFileReader(os.path.join(self._root, fileid))
        views.append(reader.tagged_words(tagset, tags))
    return concat(views)
def lemma_sents(self, fileids=None):
    """
    Read the selected files as sentences of (word, lemma) pairs.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of sentences or utterances, each
        encoded as a list of tuples of the word and the corresponding
        lemma (word, lemma)
    :rtype: list(list(tuple(str, str)))
    """
    per_file = []
    for fileid in self.__fileids(fileids):
        # One MTEFileReader per corpus file, addressed relative to the root.
        file_reader = MTEFileReader(os.path.join(self._root, fileid))
        per_file.append(file_reader.lemma_sents())
    return concat(per_file)
def tagged_sents(self, fileids=None, tagset="msd", tags=""):
    """
    Read the selected files as sentences of (word, tag) pairs.

    :param fileids: A list specifying the fileids that should be used.
    :param tagset: The tagset that should be used in the returned object,
        either "universal" or "msd", "msd" is the default
    :param tags: An MSD Tag that is used to filter all parts of the used
        corpus that are not more precise or at least equal to the given tag
    :return: the given file(s) as a list of sentences or utterances, each
        encoded as a list of (word,tag) tuples; ``None`` (after printing a
        message) when ``tagset`` is not one of the two supported names
    :rtype: list(list(tuple(str, str)))
    """
    if tagset not in ("universal", "msd"):
        # Unsupported tagset: report it on stdout and return nothing.
        print("Unknown tagset specified.")
        return None
    per_file = []
    for fileid in self.__fileids(fileids):
        file_reader = MTEFileReader(os.path.join(self._root, fileid))
        per_file.append(file_reader.tagged_sents(tagset, tags))
    return concat(per_file)
def lemma_paras(self, fileids=None):
    """
    Read the selected files as paragraphs of (word, lemma) sentences.

    :param fileids: A list specifying the fileids that should be used.
    :return: the given file(s) as a list of paragraphs, each encoded as a
        list of sentences, which are in turn encoded as a list of
        tuples of the word and the corresponding lemma (word, lemma)
    :rtype: list(List(List(tuple(str, str))))
    """
    per_file = []
    for fileid in self.__fileids(fileids):
        # One MTEFileReader per corpus file, addressed relative to the root.
        file_reader = MTEFileReader(os.path.join(self._root, fileid))
        per_file.append(file_reader.lemma_paras())
    return concat(per_file)
def tagged_paras(self, fileids=None, tagset="msd", tags=""):
    """
    Read the selected files as paragraphs of (word, tag) sentences.

    :param fileids: A list specifying the fileids that should be used.
    :param tagset: The tagset that should be used in the returned object,
        either "universal" or "msd", "msd" is the default
    :param tags: An MSD Tag that is used to filter all parts of the used
        corpus that are not more precise or at least equal to the given tag
    :return: the given file(s) as a list of paragraphs, each encoded as a
        list of sentences, which are in turn encoded as a list
        of (word,tag) tuples; ``None`` (after printing a message) when
        ``tagset`` is not one of the two supported names
    :rtype: list(list(list(tuple(str, str))))
    """
    if tagset not in ("universal", "msd"):
        # Unsupported tagset: report it on stdout and return nothing.
        print("Unknown tagset specified.")
        return None
    per_file = []
    for fileid in self.__fileids(fileids):
        file_reader = MTEFileReader(os.path.join(self._root, fileid))
        per_file.append(file_reader.tagged_paras(tagset, tags))
    return concat(per_file)

View File

@@ -0,0 +1,489 @@
# Natural Language Toolkit: NKJP Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Gabriela Kaczka
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
import functools
import os
import re
import tempfile
from six import string_types
from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
def _parse_args(fun):
"""
Wraps function arguments:
if fileids not specified then function set NKJPCorpusReader paths.
"""
@functools.wraps(fun)
def decorator(self, fileids=None, **kwargs):
if not fileids:
fileids = self._paths
return fun(self, fileids, **kwargs)
return decorator
class NKJPCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the National Corpus of Polish (NKJP).

    Each document is a directory identified through its ``header.xml``;
    the query methods build a specialised view per document directory and
    concatenate the per-document results.
    """

    # Query modes understood by ``_view``.
    WORDS_MODE = 0
    SENTS_MODE = 1
    HEADER_MODE = 2
    RAW_MODE = 3

    def __init__(self, root, fileids='.*'):
        """
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp'])  #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        """
        if isinstance(fileids, string_types):
            # A string is treated as a regexp prefix of the document name.
            XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml')
        else:
            XMLCorpusReader.__init__(
                self, root, [fileid + '/header.xml' for fileid in fileids]
            )
        self._paths = self.get_paths()

    def get_paths(self):
        """
        Return the document directories (root joined with each fileid, with
        the trailing ``header.xml`` removed).
        """
        return [
            os.path.join(str(self._root), f.split("header.xml")[0])
            for f in self._fileids
        ]

    def fileids(self):
        """
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        """
        return [f.split("header.xml")[0] for f in self._fileids]

    def _view(self, filename, tags=None, **kwargs):
        """
        Returns a view specialised for use with particular corpus file.

        :param filename: document directory the view should read from.
        :param tags: tag filter handed to the view.
        :param kwargs: may carry ``mode`` (one of the ``*_MODE`` constants,
            default ``WORDS_MODE``); other entries are ignored.
        :raise NameError: if ``mode`` is not a known mode.
        """
        mode = kwargs.pop('mode', NKJPCorpusReader.WORDS_MODE)
        # Modes are plain integers: compare by value, not by identity.
        if mode == NKJPCorpusReader.WORDS_MODE:
            return NKJPCorpus_Morph_View(filename, tags=tags)
        if mode == NKJPCorpusReader.SENTS_MODE:
            return NKJPCorpus_Segmentation_View(filename, tags=tags)
        if mode == NKJPCorpusReader.HEADER_MODE:
            return NKJPCorpus_Header_View(filename, tags=tags)
        if mode == NKJPCorpusReader.RAW_MODE:
            return NKJPCorpus_Text_View(
                filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE
            )
        raise NameError('No such mode!')

    def add_root(self, fileid):
        """
        Add root if necessary to specified fileid.
        """
        if self.root in fileid:
            return fileid
        return self.root + fileid

    def _query(self, fileids, mode, **kwargs):
        """
        Run ``handle_query()`` on a ``mode`` view of every fileid and
        concatenate the results.
        """
        return concat(
            [
                self._view(self.add_root(fileid), mode=mode, **kwargs).handle_query()
                for fileid in fileids
            ]
        )

    @_parse_args
    def header(self, fileids=None, **kwargs):
        """
        Returns header(s) of specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.HEADER_MODE, **kwargs)

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """
        Returns sentences in specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.SENTS_MODE, **kwargs)

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """
        Returns words in specified fileids.
        """
        return self._query(fileids, NKJPCorpusReader.WORDS_MODE, **kwargs)

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        """
        tags = kwargs.pop('tags', [])
        return self._query(
            fileids, NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs
        )

    @_parse_args
    def raw(self, fileids=None, **kwargs):
        """
        Returns the text of specified fileids, as produced by the text view
        in raw mode.
        """
        return self._query(fileids, NKJPCorpusReader.RAW_MODE, **kwargs)
class NKJPCorpus_Header_View(XMLCorpusView):
    """
    HEADER_MODE view over the ``header.xml`` file of one NKJP document.
    """

    def __init__(self, filename, **kwargs):
        """
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.

        :param filename: document directory prefix; ``'header.xml'`` is
            appended to it verbatim.
        :param kwargs: accepted for signature parity with the other views
            and ignored.
        """
        self.tagspec = ".*/sourceDesc$"
        XMLCorpusView.__init__(self, filename + 'header.xml', self.tagspec)

    def handle_query(self):
        """
        Read every block of the header file and return the collected
        elements as one list. The stream is closed even if reading fails.
        """
        self._open()
        header = []
        try:
            while True:
                segm = XMLCorpusView.read_block(self, self._stream)
                if len(segm) == 0:
                    break
                header.extend(segm)
        finally:
            self.close()
        return header

    @staticmethod
    def _joined_text(elt, path):
        """
        Join the stripped text of every ``path`` match with newlines.

        Returns an empty list (not an empty string) when nothing matches,
        which is what this view has always reported for missing fields.
        """
        found = elt.findall(path)
        if not found:
            return []
        # An element without text has ``text is None``; treat it as empty.
        return '\n'.join((node.text or '').strip() for node in found)

    def handle_elt(self, elt, context):
        """
        Convert one ``sourceDesc`` element into a dict of its bibliographic
        fields (title, author, date, publisher, idno, note).
        """
        return {
            'title': self._joined_text(elt, 'bibl/title'),
            'author': self._joined_text(elt, 'bibl/author'),
            'date': self._joined_text(elt, 'bibl/date'),
            'publisher': self._joined_text(elt, 'bibl/publisher'),
            'idno': self._joined_text(elt, 'bibl/idno'),
            'note': self._joined_text(elt, 'bibl/note'),
        }
class XML_Tool:
    """
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    """

    # Applied to every line in this order; each match is replaced by a single
    # space. Byte patterns are used because the temporary file is opened in
    # binary mode, so the input is copied without decoding or re-encoding.
    _STRIP_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            br'nkjp:[^ ]* ',  # in all files
            b'<nkjp:paren>',  # in ann_segmentation.xml
            b'</nkjp:paren>',  # in ann_segmentation.xml
            b'<choice>',  # in ann_segmentation.xml
            b'</choice>',  # in ann_segmentation.xml
        )
    )

    def __init__(self, root, filename):
        """
        :param root: directory containing the file to preprocess.
        :param filename: name of the XML file inside ``root``.
        """
        self.read_file = os.path.join(root, filename)
        # delete=False: the file must outlive this handle so a view can
        # reopen it by name; callers remove it explicitly afterwards.
        self.write_file = tempfile.NamedTemporaryFile(delete=False)

    def build_preprocessed_file(self):
        """
        Copy ``read_file`` into the temporary file with the nkjp: references
        and ``<choice>`` wrappers stripped.

        :return: name of the temporary file.
        :raise Exception: whatever error occurred while reading or writing;
            the temporary file is removed before the error propagates.
        """
        fw = self.write_file
        try:
            with open(self.read_file, 'rb') as fr:
                for line in fr:
                    for pattern in self._STRIP_PATTERNS:
                        line = pattern.sub(b' ', line)
                    fw.write(line)
            fw.close()
            return fw.name
        except Exception:
            # Release the handle first so the removal also works on
            # platforms that refuse to delete open files.
            fw.close()
            self.remove_preprocessed_file()
            raise

    def remove_preprocessed_file(self):
        """
        Delete the temporary file; a file that is already gone is not an
        error.
        """
        try:
            os.remove(self.write_file.name)
        except OSError:
            if os.path.exists(self.write_file.name):
                raise
class NKJPCorpus_Segmentation_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        """
        :param filename: document directory holding ``text.xml`` and
            ``ann_segmentation.xml``.
        :param kwargs: accepted for signature parity with the other views
            and ignored.
        """
        self.tagspec = '.*p/.*s'
        # intersperse NKJPCorpus_Text_View: running its query fills
        # ``text_view.segm_dict``, which get_sentences() slices later.
        self.text_view = NKJPCorpus_Text_View(
            filename, mode=NKJPCorpus_Text_View.SENTS_MODE
        )
        self.text_view.handle_query()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml')
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def get_segm_id(self, example_word):
        """Return the text-segment id: the text between '(' and the first ','."""
        return example_word.split('(')[1].split(',')[0]

    def get_sent_beg(self, beg_word):
        # returns index of beginning letter in sentence
        return int(beg_word.split(',')[1])

    def get_sent_end(self, end_word):
        # returns index of end letter in sentence (second field + third field)
        splitted = end_word.split(')')[0].split(',')
        return int(splitted[1]) + int(splitted[2])

    def get_sentences(self, sent_segm):
        # returns one sentence, cut out of the text segment its first word
        # points into
        segm_id = self.get_segm_id(sent_segm[0])
        segm = self.text_view.segm_dict[segm_id]  # text segment
        beg = self.get_sent_beg(sent_segm[0])
        end = self.get_sent_end(sent_segm[-1])
        return segm[beg:end]

    def remove_choice(self, segm):
        """
        Drop alternative segmentations: keep a word only if it starts at or
        after the end of the previously kept word, or belongs to a different
        text segment.
        """
        ret = []
        prev_txt_end = -1
        prev_txt_nr = -1
        for word in segm:
            txt_nr = self.get_segm_id(word)
            # get increasing sequence of ids: in case of choice get first possibility
            if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr:
                ret.append(word)
                prev_txt_end = self.get_sent_end(word)
                prev_txt_nr = txt_nr
        return ret

    def handle_query(self):
        """
        Return the sentences of the document as a list of strings.

        The stream is closed and the preprocessed temporary file removed
        whether or not reading succeeds; an error propagates unchanged
        instead of being replaced by a bare ``Exception``.
        """
        try:
            self._open()
            try:
                sentences = []
                while True:
                    sent_segm = XMLCorpusView.read_block(self, self._stream)
                    if len(sent_segm) == 0:
                        break
                    for segm in sent_segm:
                        segm = self.remove_choice(segm)
                        sentences.append(self.get_sentences(segm))
                return sentences
            finally:
                self.close()
        finally:
            self.xml_tool.remove_preprocessed_file()

    def handle_elt(self, elt, context):
        """Return the ``corresp`` attribute of every child of ``elt``."""
        return [seg.get('corresp') for seg in elt]
class NKJPCorpus_Text_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    """

    SENTS_MODE = 0
    RAW_MODE = 1

    def __init__(self, filename, **kwargs):
        """
        :param filename: document directory holding ``text.xml``.
        :param kwargs: may carry ``mode`` (``SENTS_MODE`` by default); other
            entries are ignored.
        """
        self.mode = kwargs.pop('mode', 0)
        self.tagspec = '.*/div/ab'
        # Maps segment id -> segment text; only filled in SENTS_MODE.
        self.segm_dict = dict()
        # xml preprocessing
        self.xml_tool = XML_Tool(filename, 'text.xml')
        # base class init
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """
        Read the whole file and return its text.

        The stream is closed and the preprocessed temporary file removed
        whether or not reading succeeds; an error propagates unchanged
        instead of being replaced by a bare ``Exception``.
        """
        try:
            self._open()
            try:
                return self.read_block(self._stream)
            finally:
                self.close()
        finally:
            self.xml_tool.remove_preprocessed_file()

    def read_block(self, stream, tagspec=None, elt_handler=None):
        """
        Returns text as a list of sentences.

        All blocks are read until the base view yields an empty one; the
        collected segment texts are joined with single spaces into the only
        element of the returned list.
        """
        txt = []
        while True:
            segm = XMLCorpusView.read_block(self, stream)
            if len(segm) == 0:
                break
            txt.extend(segm)
        return [' '.join(txt)]

    def get_segm_id(self, elt):
        """
        Return the value of the first attribute whose name ends in ``id``
        (``None`` if there is no such attribute).
        """
        for attr in elt.attrib:
            if attr.endswith('id'):
                return elt.get(attr)

    def handle_elt(self, elt, context):
        # fill dictionary to use later in sents mode
        # (mode constants are ints: compare by value, not identity)
        if self.mode == NKJPCorpus_Text_View.SENTS_MODE:
            self.segm_dict[self.get_segm_id(elt)] = elt.text
        return elt.text
class NKJPCorpus_Morph_View(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    """

    def __init__(self, filename, **kwargs):
        """
        :param filename: document directory holding ``ann_morphosyntax.xml``.
        :param kwargs: may carry ``tags``, the collection of ctag values to
            keep; ``None`` (the default) keeps every word.
        """
        self.tags = kwargs.pop('tags', None)
        self.tagspec = '.*/seg/fs'
        self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
        XMLCorpusView.__init__(
            self, self.xml_tool.build_preprocessed_file(), self.tagspec
        )

    def handle_query(self):
        """
        Return the words selected by ``handle_elt`` as a list.

        The stream is closed and the preprocessed temporary file removed
        whether or not reading succeeds; an error propagates unchanged
        instead of being replaced by a bare ``Exception``.
        """
        try:
            self._open()
            try:
                words = []
                while True:
                    segm = XMLCorpusView.read_block(self, self._stream)
                    if len(segm) == 0:
                        break
                    # handle_elt returns None for filtered-out segments.
                    words.extend(part for part in segm if part is not None)
                return words
            finally:
                self.close()
        finally:
            self.xml_tool.remove_preprocessed_file()

    def handle_elt(self, elt, context):
        """
        Return the orthographic form of one segment, or ``None`` when the
        segment is filtered out.

        A segment is kept when no tag filter is set or one of its ``lex``
        interpretations carries a ctag listed in ``self.tags``, and no ctag
        outside the filter has the value ``'interp'``.
        """
        word = ''
        # if tags not specified, then always return word
        flag = self.tags is None
        is_not_interp = True
        for child in elt:
            name = child.get('name')
            if name == 'orth':
                # get word
                for symbol in child:
                    if symbol.tag == 'string':
                        word = symbol.text
            elif name == 'interps':
                for lex in child:
                    if lex.get('type') != 'lex':
                        continue
                    for feature in lex:
                        if feature.get('name') != 'ctag':
                            continue
                        for ctag in feature:
                            value = ctag.get('value')
                            if value is None:
                                continue
                            if self.tags is not None and value in self.tags:
                                flag = True
                            elif value == 'interp':
                                is_not_interp = False
        if flag and is_not_interp:
            return word

View File

@@ -0,0 +1,485 @@
# Natural Language Toolkit: NomBank Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Authors: Paul Bedaride <paul.bedaride@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
from xml.etree import ElementTree
from functools import total_ordering
from six import string_types
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles='',
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding='utf8',
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param nounsfile: The name of the file read by ``nouns()``
            (relative to ``root``).
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by nombank.
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        # If framefiles is specified as a regexp, expand it.  The expanded
        # list must replace ``framefiles`` itself: calling ``list()`` on the
        # regexp string would yield its individual characters.
        if isinstance(framefiles, string_types):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)

        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def instances(self, baseform=None):
        """
        :param baseform: if given, only instances with this baseform are
            included.
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file or the roleset is missing.
        """
        baseform = roleset_id.split('.')[0]
        # Undo the escaping applied by ``NombankInstance.roleset``; both
        # spellings of one-tenth end up as the '1-slash-10' file name.
        baseform = baseform.replace('perc-sign', '%')
        baseform = baseform.replace('oneslashonezero', '1/10').replace(
            '1/10', '1-slash-10'
        )
        framefile = 'frames/%s.xml' % baseform
        if framefile not in self.fileids():
            raise ValueError('Frameset file for %s not found' % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))

    def rolesets(self, baseform=None):
        """
        :param baseform: if given, only the frameset file for this baseform
            is consulted; otherwise every frameset file is.
        :return: list of xml descriptions for rolesets.
        :raise ValueError: if ``baseform`` has no frameset file.
        """
        if baseform is not None:
            framefile = 'frames/%s.xml' % baseform
            if framefile not in self.fileids():
                raise ValueError('Frameset file for %s not found' % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall('predicate/roleset'))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and return the parsed
        instances accepted by ``instance_filter``; blank lines are skipped.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
######################################################################
# { Nombank Instance & related datatypes
######################################################################
@python_2_unicode_compatible
class NombankInstance(object):
    """
    One annotated noun occurrence: the location of its sentence, the
    predicate pointer, and the pointers/labels of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):

        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A ``NombankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A tuple of tuples (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This tuple does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to
        look up information about the roleset."""
        r = self.baseform.replace('%', 'perc-sign')
        r = r.replace('1/10', '1-slash-10').replace('1-slash-10', 'oneslashonezero')
        return '%s.%s' % (r, self.sensenumber)

    def __repr__(self):
        return '<NombankInstance: %s, sent %s, word %s>' % (
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        s = '%s %s %s %s %s' % (
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        # The predicate is printed among the arguments, labelled 'rel'.
        items = self.arguments + ((self.predicate, 'rel'),)
        for (argloc, argid) in sorted(items):
            s += ' %s-%s' % (argloc, argid)
        return s

    def _get_tree(self):
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``NombankInstance`` from one line of the annotation file.

        :param s: whitespace-separated line: fileid, sentence number, word
            number, baseform, sense number, then the ``loc-label`` items,
            exactly one of which must contain ``-rel``.
        :param parse_fileid_xform: optional function applied to the fileid.
        :param parse_corpus: stored on the instance for tree lookup.
        :raise ValueError: if the line has fewer than six fields or does not
            contain exactly one ``-rel`` item.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError('Badly formatted nombank line: %r' % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]

        # Separate the predicate item from the arguments without mutating
        # the list while it is being iterated (popping during enumeration
        # skipped the item following each match).
        items = pieces[5:]
        rel = [p for p in items if '-rel' in p]
        args = [p for p in items if '-rel' not in p]
        if len(rel) != 1:
            raise ValueError('Badly formatted nombank line: %r' % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split('-', 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split('-', 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )
class NombankPointer(object):
    """
    A pointer used by nombank to identify one or more constituents in
    a parse tree.  ``NombankPointer`` is an abstract base class with
    three concrete subclasses:

    - ``NombankTreePointer`` is used to point to single constituents.
    - ``NombankSplitTreePointer`` is used to point to 'split'
      constituents, which consist of a sequence of two or more
      ``NombankTreePointer`` pointers.
    - ``NombankChainTreePointer`` is used to point to entire trace
      chains in a tree.  It consists of a sequence of pieces, which
      can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Only the abstract base itself refuses to be instantiated;
        # subclasses that reach this initializer pass through.
        is_abstract_base = self.__class__ is NombankPointer
        if is_abstract_base:
            raise NotImplementedError()
@python_2_unicode_compatible
class NombankChainTreePointer(NombankPointer):
    """
    Pointer to a trace chain, written as its pieces joined by ``*``.
    """

    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements may
           be either ``NombankSplitTreePointer`` or
           ``NombankTreePointer`` pointers."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return '*'.join(rendered)

    def __repr__(self):
        return '<NombankChainTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are the constituents each
        piece selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree('*CHAIN*', selected)
@python_2_unicode_compatible
class NombankSplitTreePointer(NombankPointer):
    """
    Pointer to a split constituent, written as its pieces joined by ``,``.
    """

    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this split constituent.
           Elements are all ``NombankTreePointer`` pointers."""

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return ','.join(rendered)

    def __repr__(self):
        return '<NombankSplitTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are the constituents each
        piece selects from ``tree``.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = [piece.select(tree) for piece in self.pieces]
        return Tree('*SPLIT*', selected)
@total_ordering
@python_2_unicode_compatible
class NombankTreePointer(NombankPointer):
    """
    wordnum:height*wordnum:height*...
    wordnum:height,

    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string into a chain, split or plain tree pointer.

        :raise ValueError: if a plain pointer is not of the form
            ``wordnum:height``.
        """
        # Deal with chains (xx*yy*zz)
        chain_parts = s.split('*')
        if len(chain_parts) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(part) for part in chain_parts]
            )

        # Deal with split args (xx,yy,zz)
        split_parts = s.split(',')
        if len(split_parts) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(part) for part in split_parts]
            )

        # Deal with normal pointers.
        fields = s.split(':')
        if len(fields) != 2:
            raise ValueError('bad nombank pointer %r' % s)
        wordnum, height = fields
        return NombankTreePointer(int(wordnum), int(height))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height)

    @staticmethod
    def _leading_pointer(other):
        """Unwrap chain/split pointers down to their first piece."""
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]
        return other

    def __eq__(self, other):
        other = self._leading_pointer(other)
        if isinstance(other, NombankTreePointer):
            return self.wordnum == other.wordnum and self.height == other.height
        return self is other

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        other = self._leading_pointer(other)
        if isinstance(other, NombankTreePointer):
            # Earlier word first; for the same word, greater height first.
            return (self.wordnum, -self.height) < (other.wordnum, -other.height)
        return id(self) < id(other)

    def select(self, tree):
        """
        Return the constituent of ``tree`` this pointer refers to.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        stack = [tree]
        treepos = []

        wordnum = 0
        while True:
            node = stack[-1]
            if isinstance(node, Tree):
                # tree node: select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                child_index = treepos[-1]
                if child_index < len(node):
                    stack.append(node[child_index])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            else:
                # word node: stop at the requested word, climbing
                # ``height`` levels above the leaf's parent.
                if wordnum == self.wordnum:
                    return tuple(treepos[: len(treepos) - self.height - 1])
                wordnum += 1
                stack.pop()

View File

@@ -0,0 +1,92 @@
# Natural Language Toolkit: NPS Chat Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
import re
import textwrap
from nltk.util import LazyConcatenation
from nltk.internals import ElementWrapper
from nltk.tag import map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import *
class NPSChatCorpusReader(XMLCorpusReader):
    """
    Reader for the NPS Chat corpus: XML session files whose posts carry
    word/part-of-speech terminals.
    """

    def __init__(self, root, fileids, wrap_etree=False, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param fileids: The fileids that make up this corpus.
        :param wrap_etree: forwarded to ``XMLCorpusReader``; when true,
            ``xml_posts`` wraps each element in an ``ElementWrapper``.
        :param tagset: name of the tagset the corpus is annotated with,
            used as the source tagset when tags are mapped.
        """
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        self._tagset = tagset

    def xml_posts(self, fileids=None):
        """Return the ``Post`` elements of the given files."""
        # Only pass an element handler when wrapping was requested.
        handler_args = (self._wrap_elt,) if self._wrap_etree else ()
        return concat(
            [
                XMLCorpusView(fileid, 'Session/Posts/Post', *handler_args)
                for fileid in self.abspaths(fileids)
            ]
        )

    def posts(self, fileids=None):
        """Return the posts of the given files, each as a list of words."""
        return concat(
            [
                XMLCorpusView(
                    fileid, 'Session/Posts/Post/terminals', self._elt_to_words
                )
                for fileid in self.abspaths(fileids)
            ]
        )

    def tagged_posts(self, fileids=None, tagset=None):
        """
        Return the posts of the given files, each as a list of
        (word, tag) tuples, mapped to ``tagset`` when one is given.
        """

        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, tagset)

        return concat(
            [
                XMLCorpusView(fileid, 'Session/Posts/Post/terminals', reader)
                for fileid in self.abspaths(fileids)
            ]
        )

    def words(self, fileids=None):
        """Return the words of all posts in the given files."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return the (word, tag) tuples of all posts in the given files."""
        return LazyConcatenation(self.tagged_posts(fileids, tagset))

    def _wrap_elt(self, elt, handler):
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        return [self._simplify_username(t.attrib['word']) for t in elt.findall('t')]

    def _elt_to_tagged_words(self, elt, handler, tagset=None):
        tagged_post = [
            (self._simplify_username(t.attrib['word']), t.attrib['pos'])
            for t in elt.findall('t')
        ]
        # Map only when a target tagset was requested and it differs from
        # the corpus's own.
        if tagset and tagset != self._tagset:
            tagged_post = [
                (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post
            ]
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        """
        Shorten a user name: keep what follows the first ``'User'`` and
        prefix it with ``'U'``. Other words are returned unchanged.

        Byte strings are decoded as ASCII first, so the substring test below
        never mixes bytes and text.
        """
        if isinstance(word, bytes):
            word = word.decode('ascii')
        if 'User' in word:
            word = 'U' + word.split('User', 1)[1]
        return word

View File

@@ -0,0 +1,123 @@
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Opinion Lexicon.
- Opinion Lexicon information -
Authors: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
http://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
from six import string_types
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """
    This CorpusView is used to skip the initial readme block of the corpus.
    """

    def __init__(self, *args, **kwargs):
        """
        Build the view as usual, then move its starting position past the
        first blank-line-delimited block of the file.
        """
        StreamBackedCorpusView.__init__(self, *args, **kwargs)
        # The stream has to be open before anything can be consumed from it.
        self._open()
        stream = self._stream
        # Consume (and discard) the readme block.
        read_blankline_block(stream)
        # Whatever follows the readme is where the view now begins.
        start_offset = stream.tell()
        self._filepos = [start_offset]
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon.  Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10]
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10]
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    # View class used for every file; it skips the leading readme block.
    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        views = []
        for path, enc, _fileid in self.abspaths(fileids, True, True):
            views.append(self.CorpusView(path, self._read_word_block, encoding=enc))
        return concat(views)

    def positive(self):
        """
        Return all positive words, in the order ``words()`` yields them for
        ``positive-words.txt``.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words('positive-words.txt')

    def negative(self):
        """
        Return all negative words, in the order ``words()`` yields them for
        ``negative-words.txt``.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words('negative-words.txt')

    def _read_word_block(self, stream):
        """
        Read up to 20 lines from ``stream`` and return them stripped of
        surrounding whitespace; reads that return nothing are skipped.
        """
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if line:
                words.append(line.strip())
        return words

View File

@@ -0,0 +1,174 @@
# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for PanLex Lite, a stripped down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""
import os
import sqlite3
from nltk.corpus.reader.api import CorpusReader
class PanLexLiteCorpusReader(CorpusReader):
    """
    Reader that answers queries against the ``db.sqlite`` database found in
    the corpus root directory.
    """

    # For a given expression (text + language variety), select every other
    # expression that shares a meaning with it, best source quality first.
    MEANING_Q = """
        SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
        FROM dnx
        JOIN ex ON (ex.ex = dnx.ex)
        JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
        JOIN ex ex2 ON (ex2.ex = dnx2.ex)
        WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
        ORDER BY dnx2.uq DESC
    """

    # For a given expression and target language variety, select the target
    # expressions together with a summed quality score, best score first.
    TRANSLATION_Q = """
        SELECT s.tt, sum(s.uq) AS trq FROM (
            SELECT ex2.tt, max(dnx.uq) AS uq
            FROM dnx
            JOIN ex ON (ex.ex = dnx.ex)
            JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
            JOIN ex ex2 ON (ex2.ex = dnx2.ex)
            WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
            GROUP BY ex2.tt, dnx.ui
        ) s
        GROUP BY s.tt
        ORDER BY trq DESC, s.tt
    """

    def __init__(self, root):
        """
        Open ``db.sqlite`` under ``root`` and cache the mapping between
        language variety uniform identifiers and their ``lv`` values.

        :param root: directory that contains the ``db.sqlite`` file.
        """
        db_path = os.path.join(root, 'db.sqlite')
        self._c = sqlite3.connect(db_path).cursor()

        # Two-way lookup tables filled from the ``lv`` table.
        self._uid_lv = {}
        self._lv_uid = {}
        for (uid, lv) in self._c.execute('SELECT uid, lv FROM lv'):
            self._uid_lv[uid] = lv
            self._lv_uid[lv] = uid

    def language_varieties(self, lc=None):
        """
        Return a list of PanLex language varieties.

        :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
            by this code. If unspecified, all varieties are returned.
        :return: the specified language varieties as a list of tuples. The first
            element is the language variety's seven-character uniform identifier,
            and the second element is its default name.
        :rtype: list(tuple)
        """
        if lc is not None:
            filtered = self._c.execute(
                'SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid', (lc,)
            )
            return filtered.fetchall()
        everything = self._c.execute('SELECT uid, tt FROM lv ORDER BY uid')
        return everything.fetchall()

    def meanings(self, expr_uid, expr_tt):
        """
        Return a list of meanings for an expression.

        :param expr_uid: the expression's language variety, as a seven-character
            uniform identifier.
        :param expr_tt: the expression's text.
        :return: a list of Meaning objects.
        :rtype: list(Meaning)
        """
        expr_lv = self._uid_lv[expr_uid]
        by_meaning = {}

        rows = self._c.execute(self.MEANING_Q, (expr_tt, expr_lv))
        for (mn, uq, ap, ui, tt, lv) in rows:
            uid = self._lv_uid[lv]

            # First row seen for this meaning: record its source attributes
            # and seed the expression table with the queried expression.
            if mn not in by_meaning:
                by_meaning[mn] = {
                    'uq': uq,
                    'ap': ap,
                    'ui': ui,
                    'ex': {expr_uid: [expr_tt]},
                }

            by_meaning[mn]['ex'].setdefault(uid, []).append(tt)

        return [Meaning(mn, attrs) for (mn, attrs) in by_meaning.items()]

    def translations(self, from_uid, from_tt, to_uid):
        """
        Return a list of translations for an expression into a single language
        variety.

        :param from_uid: the source expression's language variety, as a
            seven-character uniform identifier.
        :param from_tt: the source expression's text.
        :param to_uid: the target language variety, as a seven-character
            uniform identifier.
        :return: a list of translation tuples. The first element is the expression
            text and the second element is the translation quality.
        :rtype: list(tuple)
        """
        params = (self._uid_lv[from_uid], from_tt, self._uid_lv[to_uid])
        return self._c.execute(self.TRANSLATION_Q, params).fetchall()
class Meaning(dict):
    """
    A single PanLex meaning, stored as a dictionary.  A meaning is a
    translation set derived from a single source.

    The accessor methods below are thin wrappers around the dictionary keys
    ``'mn'``, ``'uq'``, ``'ap'``, ``'ui'`` and ``'ex'``.
    """

    def __init__(self, mn, attr):
        """
        :param mn: the meaning's id; stored under the ``'mn'`` key.
        :param attr: mapping of the remaining attributes, copied into this
            dictionary.
        """
        dict.__init__(self, **attr)
        # The explicit id always wins over any 'mn' entry present in attr.
        self.update(mn=mn)

    def id(self):
        """
        :return: the meaning's id.
        :rtype: int
        """
        return self['mn']

    def quality(self):
        """
        :return: the meaning's source's quality (0=worst, 9=best).
        :rtype: int
        """
        return self['uq']

    def source(self):
        """
        :return: the meaning's source id.
        :rtype: int
        """
        return self['ap']

    def source_group(self):
        """
        :return: the meaning's source group id.
        :rtype: int
        """
        return self['ui']

    def expressions(self):
        """
        :return: the meaning's expressions as a dictionary whose keys are language
            variety uniform identifiers and whose values are lists of expression
            texts.
        :rtype: dict
        """
        return self['ex']

View File

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function
from collections import namedtuple, defaultdict
import re
from six import string_types
from nltk.tokenize import line_tokenize
from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# Record describing one PanLex language variety.  Fields, in order:
#   (1) panlex_uid   -- PanLex UID
#   (2) iso639       -- ISO 639 language code
#   (3) iso639_type  -- ISO 639 language type, see README
#   (4) script       -- normal scripts of expressions
#   (5) name         -- PanLex default name
#   (6) langvar_uid  -- UID of the language variety in which the default
#                       name is an expression
PanlexLanguage = namedtuple(
    'PanlexLanguage',
    'panlex_uid iso639 iso639_type script name langvar_uid',
)
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        """
        Initialise the word-list reader, then derive the Swadesh list size
        from the first fileid and load the language tables.
        """
        super(PanlexSwadeshCorpusReader, self).__init__(*args, **kwargs)
        # Find the swadesh size using the fileids' path.
        first_fileid = self.fileids()[0]
        size_match = re.match(r'swadesh([0-9].*)\/', first_fileid)
        self.swadesh_size = size_match.group(1)
        # PanLex UID -> PanlexLanguage record.
        self._languages = dict(
            (lang.panlex_uid, lang) for lang in self.get_languages()
        )
        # ISO 639 code -> list of PanLex UIDs sharing that code.
        self._macro_langauges = self.get_macrolanguages()

    def license(self):
        """Print the name of the license of this corpus."""
        print('CC0 1.0 Universal')

    def readme(self):
        """Print the contents of the corpus ``README`` file."""
        print(self.raw('README'))

    def language_codes(self):
        """
        :return: the PanLex UIDs of all languages listed for this corpus.
        """
        return self._languages.keys()

    def get_languages(self):
        """
        Yield one ``PanlexLanguage`` per non-empty line of the
        ``langs<size>.txt`` file; each line is split on tab characters.
        """
        langs_fileid = 'langs{}.txt'.format(self.swadesh_size)
        for raw_line in self.raw(langs_fileid).split('\n'):
            row = raw_line.strip()
            if row:  # Skip empty lines.
                yield PanlexLanguage(*row.split('\t'))

    def get_macrolanguages(self):
        """
        :return: a mapping from ISO 639 code to the list of PanLex UIDs of
            the loaded languages that carry that code.
        :rtype: defaultdict(list)
        """
        uids_by_iso = defaultdict(list)
        for language in self._languages.values():
            uids_by_iso[language.iso639].append(language.panlex_uid)
        return uids_by_iso

    def words_by_lang(self, lang_code):
        """
        :return: a list of list(str)
        """
        fileid = 'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
        concepts = []
        for concept in self.words(fileid):
            concepts.append(concept.split('\t'))
        return concepts

    def words_by_iso639(self, iso63_code):
        """
        :return: a list of list(str)
        """
        fileids = [
            'swadesh{}/{}.txt'.format(self.swadesh_size, lang_code)
            for lang_code in self._macro_langauges[iso63_code]
        ]
        concepts = []
        for fileid in fileids:
            for concept in self.words(fileid):
                concepts.append(concept.split('\t'))
        return concepts

    def entries(self, fileids=None):
        """
        :return: a list of tuples; the i-th tuple holds the i-th word of each
            of the specified fileids (all fileids when none are given).
        """
        if not fileids:
            fileids = self.fileids()
        per_file_words = [self.words(fileid) for fileid in fileids]
        return list(zip(*per_file_words))

View File

@@ -0,0 +1,383 @@
# Natural Language Toolkit: Pl196x Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Piotr Kasprzyk <p.j.kasprzyk@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from six import string_types
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader
# Patterns used to pull structure out of the corpus markup.  None of them is
# compiled with DOTALL, so each match has to sit on a single line.

# <p ...>...</p>: captures the paragraph content (shortest match).
PARA = re.compile(r'<p(?: [^>]*){0,1}>(.*?)</p>')
# <s ...>...</s>: captures the sentence content (shortest match).
SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)</s>')
# <w ...>...</w> or <c ...>...</c>: group 1 is the opening tag without its
# leading '<' (tag letter, attributes and closing '>'), group 2 the content.
TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>')
# Same elements as TAGGEDWORD, but capturing only the content.
WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)</[wc]>')
# Value of a type="..." attribute.
TYPE = re.compile(r'type="(.*?)"')
# Value of an ana="..." attribute.
ANA = re.compile(r'ana="(.*?)"')
# Value of the id attribute in 'text id="..."'.
TEXTID = re.compile(r'text id="(.*?)"')
class TEICorpusView(StreamBackedCorpusView):
    """
    Corpus view that extracts words, sentences or paragraphs (optionally
    tagged) from ``<text>`` elements using the module-level patterns.
    """

    # Size hint handed to ``stream.readlines()`` for each block.
    _pagesize = 4096

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        group_by_para,
        tagset=None,
        head_len=0,
        textids=None,
    ):
        """
        :param corpus_file: the file this view reads from.
        :param tagged: if true, produce ``(word, tag)`` tuples instead of
            plain words.
        :param group_by_sent: if true, group words into sentence lists.
        :param group_by_para: if true, group each paragraph's items into
            their own list.
        :param tagset: accepted but not used by this class.
        :param head_len: start position of the view within the file; used
            to skip the header.
        :param textids: if given and non-empty, only texts whose id is in
            this collection are kept.
        """
        self._tagged = tagged
        self._textids = textids
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # WARNING -- skip header
        StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len)

    def read_block(self, stream):
        """
        Read roughly one page from ``stream``, extended line by line until
        every opened ``<text id`` has its closing ``</text>`` (or the stream
        ends), and return the items found in it.
        """
        block = concat(stream.readlines(self._pagesize))

        # Keep reading while a text is still open or none has started yet.
        while True:
            opened = block.count('<text id')
            if opened != 0 and opened <= block.count('</text>'):
                break
            extra = stream.readline()
            if len(extra) <= 0:
                break
            block += extra

        # The patterns do not match across newlines, so drop them.
        block = block.replace('\n', '')

        found_ids = TEXTID.findall(block)
        if self._textids:
            for tid in found_ids:
                if tid in self._textids:
                    continue
                # Cut the unwanted text out of the block, from just before
                # its id up to and including the next '</text>'.
                beg = block.find(tid) - 1
                end = block[beg:].find('</text>') + len('</text>')
                block = block[:beg] + block[beg + end :]

        output = []
        for para_str in PARA.findall(block):
            para = []
            for sent_str in SENT.findall(para_str):
                if self._tagged:
                    sent = [
                        self._parse_tag(pair)
                        for pair in TAGGEDWORD.findall(sent_str)
                    ]
                else:
                    sent = WORD.findall(sent_str)

                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            if self._group_by_para:
                output.append(para)
            else:
                output.extend(para)
        return output

    def _parse_tag(self, tag_word_tuple):
        """
        Turn a ``TAGGEDWORD`` match into a ``(word, tag)`` pair.

        For ``w`` elements the tag is the value of the ``ana`` attribute;
        for anything else (``c`` elements) it is the ``type`` attribute.
        """
        (opening, word) = tag_word_tuple
        attribute = ANA if opening.startswith('w') else TYPE
        return word, attribute.search(opening).group(1)
class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """
    Categorized XML corpus reader whose documents can additionally be
    selected by "text id" (the ``id`` of a ``<text>`` element inside a file).

    Every accessor takes at most one of ``fileids``, ``categories`` or
    ``textids``; when none is given the whole corpus is used.
    """

    # Start position handed to ``TEICorpusView`` so that the header at the
    # top of each corpus file is skipped.
    head_len = 2770

    def __init__(self, *args, **kwargs):
        """
        :param args: positional arguments for ``XMLCorpusReader``.
        :param kwargs: categorization arguments for
            ``CategorizedCorpusReader``, plus the optional ``textid_file``:
            path of a file mapping file ids to text ids.
        """
        self._textids = kwargs.get('textid_file')

        XMLCorpusReader.__init__(self, *args)
        CategorizedCorpusReader.__init__(self, kwargs)

        self._init_textids()

    def _init_textids(self):
        """
        Build the file-id <-> text-id lookup tables from the mapping file.

        Each non-blank line holds a file id, a space, and the text ids of
        that file separated by the categorization delimiter.

        :raise ValueError: if a listed file id is not part of this corpus.
        """
        self._f2t = defaultdict(list)
        self._t2f = defaultdict(list)
        if self._textids is None:
            return

        # Looked up once: the set of file ids does not change while loading.
        known_fileids = set(self.fileids())
        with open(self._textids) as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                file_id, text_ids = line.split(' ', 1)
                if file_id not in known_fileids:
                    raise ValueError(
                        'In text_id mapping file %s: %s not found'
                        % (self._textids, file_id)
                    )
                for text_id in text_ids.split(self._delimiter):
                    self._add_textids(file_id, text_id)

    def _add_textids(self, file_id, text_id):
        """Record that ``text_id`` is contained in ``file_id``."""
        self._f2t[file_id].append(text_id)
        self._t2f[text_id].append(file_id)

    def _resolve(self, fileids, categories, textids=None):
        """
        Translate the selection arguments into ``(fileids, textids_by_file)``.

        :return: a pair whose first element is the file ids to read (``None``
            if nothing was specified) and whose second element is either
            ``None`` or, when ``textids`` was given, a dict mapping each
            selected file id to the set of requested text ids it contains.
        :raise ValueError: if more than one selector is specified.
        """
        # NOTE: the previous check used len(filter(...)) -- a TypeError on
        # Python 3 -- and compared the number of *unset* selectors with 1,
        # which rejected every call that passed a single selector.
        specified = sum(
            1 for accessor in (fileids, categories, textids) if accessor is not None
        )
        if specified > 1:
            raise ValueError(
                'Specify at most one of: fileids, categories or textids'
            )

        if fileids is not None:
            return fileids, None

        if categories is not None:
            return self.fileids(categories), None

        if textids is not None:
            if isinstance(textids, string_types):
                textids = [textids]
            wanted = set(textids)
            # Collect each file once, in first-seen order, so that a file
            # holding several requested texts is not read repeatedly.
            files = []
            for text_id in textids:
                for file_id in self._t2f.get(text_id, ()):
                    if file_id not in files:
                        files.append(file_id)
            tdict = dict((f, set(self._f2t[f]) & wanted) for f in files)
            return files, tdict

        # Nothing specified: callers fall back to the whole corpus.
        return None, None

    def decode_tag(self, tag):
        # to be implemented
        return tag

    def textids(self, fileids=None, categories=None):
        """
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.

        :return: the sorted text ids of the selected files, or of the whole
            corpus when no selector is given.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            return sorted(self._t2f)

        if isinstance(fileids, string_types):
            fileids = [fileids]
        return sorted(
            text_id for f in fileids for text_id in self._f2t.get(f, ())
        )

    def _views(self, fileids, categories, textids, tagged, group_by_sent, group_by_para):
        """
        Shared implementation of the word/sentence/paragraph accessors:
        resolve the selection and concatenate one ``TEICorpusView`` per file.
        """
        fileids, textids = self._resolve(fileids, categories, textids)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        return concat(
            [
                TEICorpusView(
                    self.abspath(fileid),
                    tagged,
                    group_by_sent,
                    group_by_para,
                    head_len=self.head_len,
                    # Restrict to the requested texts only when selecting
                    # by text id; None means "keep everything".
                    textids=textids[fileid] if textids else None,
                )
                for fileid in fileids
            ]
        )

    def words(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a flat list of words."""
        return self._views(fileids, categories, textids, False, False, False)

    def sents(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of sentences (word lists)."""
        return self._views(fileids, categories, textids, False, True, False)

    def paras(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of paragraphs of sentences."""
        return self._views(fileids, categories, textids, False, True, True)

    def tagged_words(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a flat list of ``(word, tag)``."""
        return self._views(fileids, categories, textids, True, False, False)

    def tagged_sents(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of tagged sentences."""
        return self._views(fileids, categories, textids, True, True, False)

    def tagged_paras(self, fileids=None, categories=None, textids=None):
        """:return: the selected texts as a list of tagged paragraphs."""
        return self._views(fileids, categories, textids, True, True, True)

    def xml(self, fileids=None, categories=None):
        """
        :return: the XML of the single selected file.
        :raise TypeError: if the selection does not denote exactly one file.
        """
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            # A bare file id is a selection of one file, not a sequence of
            # characters.
            fileids = [fileids]
        if len(fileids) == 1:
            return XMLCorpusReader.xml(self, fileids[0])
        else:
            raise TypeError('Expected a single file')

    def raw(self, fileids=None, categories=None):
        """:return: the contents of the selected files, concatenated."""
        fileids, _ = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        raw_texts = []
        for f in fileids:
            stream = self.open(f)
            try:
                raw_texts.append(stream.read())
            finally:
                # Do not leave one open handle behind per file.
                stream.close()
        return concat(raw_texts)

View File

@@ -0,0 +1,263 @@
# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Nitin Madnani <nmadnani@umiacs.umd.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of plaintext documents.
"""
import nltk.data
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
       ``PlaintextCorpusReader`` may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'),
        para_block_reader=read_blankline_block,
        encoding='utf8',
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        raw_texts = []
        for f in fileids:
            _fin = self.open(f)
            try:
                raw_texts.append(_fin.read())
            finally:
                # Close the handle even when reading or decoding fails.
                _fin.close()
        return concat(raw_texts)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        :raise ValueError: if the reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')

        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        :raise ValueError: if the reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')

        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
        """Tokenize the next 20 lines of ``stream`` into a flat word list."""
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        """
        Read the next paragraph block(s) from ``stream`` and return their
        sentences, each as a list of words.
        """
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
        """
        Read the next paragraph block(s) from ``stream`` and return them,
        each as a list of sentences that are lists of words.
        """
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    Plaintext corpus reader whose documents are additionally organised
    into categories derived from their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
        """
        # The categorized base receives the kwargs dict itself, before the
        # same dict is expanded for the plaintext base.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Reduce a ``fileids``/``categories`` pair to the fileids to read.

        :raise ValueError: if both arguments are specified.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    def raw(self, fileids=None, categories=None):
        """Like ``PlaintextCorpusReader.raw``, also selectable by category."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.raw(self, selected)

    def words(self, fileids=None, categories=None):
        """Like ``PlaintextCorpusReader.words``, also selectable by category."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.words(self, selected)

    def sents(self, fileids=None, categories=None):
        """Like ``PlaintextCorpusReader.sents``, also selectable by category."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.sents(self, selected)

    def paras(self, fileids=None, categories=None):
        """Like ``PlaintextCorpusReader.paras``, also selectable by category."""
        selected = self._resolve(fileids, categories)
        return PlaintextCorpusReader.paras(self, selected)
# FIXME: Is there a better way? How to not hardcode this?
#        Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
#        override the `sent_tokenizer`.
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    ``CategorizedPlaintextCorpusReader`` that always uses the Portuguese
    punkt sentence tokenizer, replacing any ``sent_tokenizer`` argument.
    """

    def __init__(self, *args, **kwargs):
        # Force the Portuguese tokenizer, then let the parent constructor
        # initialise the categorized and plaintext bases as usual.
        kwargs['sent_tokenizer'] = nltk.data.LazyLoader(
            'tokenizers/punkt/portuguese.pickle'
        )
        CategorizedPlaintextCorpusReader.__init__(self, *args, **kwargs)
class EuroparlCorpusReader(PlaintextCorpusReader):
    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:

      - Since the corpus is pre-processed and pre-tokenized, the
        word tokenizer should just split the line at whitespaces.
      - For the same reason, the sentence tokenizer should just
        split the paragraph at line breaks.
      - There is a new 'chapters()' method that returns chapters
        instead of paragraphs.
      - The 'paras()' method inherited from PlaintextCorpusReader is
        made non-functional to remove any confusion between chapters
        and paragraphs for Europarl.
    """

    def _read_word_block(self, stream):
        """Split the next 20 lines of ``stream`` on whitespace."""
        tokens = []
        for _ in range(20):  # Read 20 lines at a time.
            tokens += stream.readline().split()
        return tokens

    def _read_sent_block(self, stream):
        """
        Return the lines of the next block(s) as sentences, each split on
        whitespace into words.
        """
        return [
            line.split()
            for chapter in self._para_block_reader(stream)
            for line in chapter.splitlines()
        ]

    def _read_para_block(self, stream):
        """
        Return the next block(s) as chapters: one list of whitespace-split
        sentences per block.
        """
        return [
            [line.split() for line in chapter.splitlines()]
            for chapter in self._para_block_reader(stream)
        ]

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        views = []
        for (fileid, enc) in self.abspaths(fileids, True):
            views.append(self.CorpusView(fileid, self._read_para_block, encoding=enc))
        return concat(views)

    def paras(self, fileids=None):
        """Not supported for Europarl; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            'The Europarl corpus reader does not support paragraphs. Please use chapters() instead.'
        )

View File

@@ -0,0 +1,107 @@
# Natural Language Toolkit: PP Attachment Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Read lines from the Prepositional Phrase Attachment Corpus.
The PP Attachment Corpus contains several files having the format:
sentence_id verb noun1 preposition noun2 attachment
For example:
42960 gives authority to administration V
46742 gives inventors of microchip N
The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.:
(VP gives (NP authority) (PP to administration))
(VP gives (NP inventors (PP of microchip)))
The corpus contains the following files:
training: training set
devset: development test set, used for algorithm development.
test: test set, used to report results
bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal.
Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional
Phrase Attachment. Proceedings of the ARPA Human Language Technology
Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps]
The PP Attachment Corpus is distributed with NLTK with the permission
of the author.
"""
from __future__ import unicode_literals
from six import string_types
from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@compat.python_2_unicode_compatible
class PPAttachment(object):
    """
    One prepositional-phrase attachment record: a sentence id, the verb,
    first noun, preposition and second noun, and the attachment label.
    """

    def __init__(self, sent, verb, noun1, prep, noun2, attachment):
        """Store the six fields of the record unchanged."""
        self.sent, self.verb = sent, verb
        self.noun1, self.prep, self.noun2 = noun1, prep, noun2
        self.attachment = attachment

    def __repr__(self):
        template = (
            'PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, '
            'noun2=%r, attachment=%r)'
        )
        fields = (
            self.sent,
            self.verb,
            self.noun1,
            self.prep,
            self.noun2,
            self.attachment,
        )
        return template % fields
class PPAttachmentCorpusReader(CorpusReader):
    """
    Reader for files whose lines have the format::

        sentence_id verb noun1 preposition noun2 attachment
    """

    def attachments(self, fileids):
        """
        :return: the lines of the given file(s) as a list of
            ``PPAttachment`` objects.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def tuples(self, fileids):
        """
        :return: the lines of the given file(s) as a list of tuples of
            whitespace-separated fields.
        """
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the contents of the given file(s) (all files by default),
            concatenated.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        raw_texts = []
        for f in fileids:
            stream = self.open(f)
            try:
                raw_texts.append(stream.read())
            finally:
                # Do not leave one open handle behind per file.
                stream.close()
        return concat(raw_texts)

    def _read_tuple_block(self, stream):
        """Read one line; return it as a one-element list holding a tuple."""
        line = stream.readline()
        if line:
            return [tuple(line.split())]
        else:
            return []

    def _read_obj_block(self, stream):
        """Read one line; return it as a one-element ``PPAttachment`` list."""
        line = stream.readline()
        if line:
            return [PPAttachment(*line.split())]
        else:
            return []

View File

@@ -0,0 +1,539 @@
# Natural Language Toolkit: PropBank Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
import re
from functools import total_ordering
from xml.etree import ElementTree
from six import string_types
from nltk.tree import Tree
from nltk.internals import raise_unorderable_types
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles='',
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding='utf8',
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file read by ``verbs()``.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, string_types):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            # (The type argument was missing here, which made every call
            # with explicit fileids fail with a TypeError.)
            fileids = [fileids]

        raw_texts = []
        for f in fileids:
            stream = self.open(f)
            try:
                raw_texts.append(stream.read())
            finally:
                stream.close()
        return concat(raw_texts)

    def instances(self, baseform=None):
        """
        :param baseform: if given, only instances whose ``baseform``
            equals this value are included.
        :return: a corpus view that acts as a list of
            ``PropbankInstance`` objects, one for each annotated verb
            instance in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs['instance_filter'] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def _parse_framefile(self, framefile):
        """Parse one frameset file; return its root element, file closed."""
        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        stream = self.abspath(framefile).open()
        try:
            return ElementTree.parse(stream).getroot()
        finally:
            stream.close()

    def roleset(self, roleset_id):
        """
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file or the roleset is not found.
        """
        baseform = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % baseform
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' % roleset_id)

        etree = self._parse_framefile(framefile)
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile))

    def rolesets(self, baseform=None):
        """
        :param baseform: if given, only the rolesets of this verb's
            frameset file are returned.
        :return: list of xml descriptions for rolesets.
        :raise ValueError: if ``baseform`` has no frameset file.
        """
        if baseform is not None:
            framefile = 'frames/%s.xml' % baseform
            if framefile not in self._framefiles:
                raise ValueError('Frameset file for %s not found' % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = [
            self._parse_framefile(framefile).findall('predicate/roleset')
            for framefile in framefiles
        ]
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and return the instances
        parsed from the non-blank ones that pass ``instance_filter``.
        """
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
######################################################################
# { Propbank Instance & related datatypes
######################################################################
@compat.python_2_unicode_compatible
class PropbankInstance(object):
def __init__(
    self,
    fileid,
    sentnum,
    wordnum,
    tagger,
    roleset,
    inflection,
    predicate,
    arguments,
    parse_corpus=None,
):
    """Record the fields that describe one propbank instance."""

    #: The name of the file containing the parse tree for this
    #: instance's sentence.
    self.fileid = fileid

    #: The sentence number of this sentence within ``fileid``.
    #: Indexing starts from zero.
    self.sentnum = sentnum

    #: The word number of this instance's predicate within its
    #: containing sentence.  Word numbers are indexed starting from
    #: zero, and include traces and other empty parse elements.
    self.wordnum = wordnum

    #: An identifier for the tagger who tagged this instance; or
    #: ``'gold'`` if this is an adjudicated instance.
    self.tagger = tagger

    #: The name of the roleset used by this instance's predicate.
    #: Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
    #: look up information about the roleset.
    self.roleset = roleset

    #: A ``PropbankInflection`` object describing the inflection of
    #: this instance's predicate.
    self.inflection = inflection

    #: A ``PropbankTreePointer`` indicating the position of this
    #: instance's predicate within its containing sentence.
    self.predicate = predicate

    #: A tuple of pairs (argloc, argid), specifying the location
    #: and identifier for each of the predicate's argument in the
    #: containing sentence.  Argument identifiers are strings such as
    #: ``'ARG0'`` or ``'ARGM-TMP'``.  This does *not* contain
    #: the predicate.
    self.arguments = tuple(arguments)

    #: A corpus reader for the parse trees corresponding to the
    #: instances in this propbank corpus.
    self.parse_corpus = parse_corpus
@property
def baseform(self):
"""The baseform of the predicate."""
return self.roleset.split('.')[0]
@property
def sensenumber(self):
"""The sense number of the predicate."""
return self.roleset.split('.')[1]
@property
def predid(self):
"""Identifier of the predicate."""
return 'rel'
def __repr__(self):
return '<PropbankInstance: %s, sent %s, word %s>' % (
self.fileid,
self.sentnum,
self.wordnum,
)
def __str__(self):
s = '%s %s %s %s %s %s' % (
self.fileid,
self.sentnum,
self.wordnum,
self.tagger,
self.roleset,
self.inflection,
)
items = self.arguments + ((self.predicate, 'rel'),)
for (argloc, argid) in sorted(items):
s += ' %s-%s' % (argloc, argid)
return s
def _get_tree(self):
if self.parse_corpus is None:
return None
if self.fileid not in self.parse_corpus.fileids():
return None
return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]
tree = property(
_get_tree,
doc="""
The parse tree corresponding to this instance, or None if
the corresponding tree is not available.""",
)
@staticmethod
def parse(s, parse_fileid_xform=None, parse_corpus=None):
pieces = s.split()
if len(pieces) < 7:
raise ValueError('Badly formatted propbank line: %r' % s)
# Divide the line into its basic pieces.
(fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
rel = [p for p in pieces[6:] if p.endswith('-rel')]
args = [p for p in pieces[6:] if not p.endswith('-rel')]
if len(rel) != 1:
raise ValueError('Badly formatted propbank line: %r' % s)
# Apply the fileid selector, if any.
if parse_fileid_xform is not None:
fileid = parse_fileid_xform(fileid)
# Convert sentence & word numbers to ints.
sentnum = int(sentnum)
wordnum = int(wordnum)
# Parse the inflection
inflection = PropbankInflection.parse(inflection)
# Parse the predicate location.
predicate = PropbankTreePointer.parse(rel[0][:-4])
# Parse the arguments.
arguments = []
for arg in args:
argloc, argid = arg.split('-', 1)
arguments.append((PropbankTreePointer.parse(argloc), argid))
# Put it all together.
return PropbankInstance(
fileid,
sentnum,
wordnum,
tagger,
roleset,
inflection,
predicate,
arguments,
parse_corpus,
)
class PropbankPointer(object):
    """
    Abstract base class for the pointers propbank uses to identify one
    or more constituents in a parse tree.  There are three concrete
    subclasses:

      - ``PropbankTreePointer`` points to a single constituent.

      - ``PropbankSplitTreePointer`` points to a 'split' constituent,
        made up of a sequence of two or more ``PropbankTreePointer``
        pointers.

      - ``PropbankChainTreePointer`` points to an entire trace chain in
        a tree.  It is made up of a sequence of pieces, each of which is
        a ``PropbankTreePointer`` or a ``PropbankSplitTreePointer``.
    """

    def __init__(self):
        # Only subclasses may be instantiated.
        instantiated_directly = self.__class__ is PropbankPointer
        if instantiated_directly:
            raise NotImplementedError()
@compat.python_2_unicode_compatible
class PropbankChainTreePointer(PropbankPointer):
    """A pointer to a trace chain, written as pieces joined by ``*``."""

    def __init__(self, pieces):
        #: A list of the pieces that make up this chain.  Elements may
        #: be either ``PropbankSplitTreePointer`` or
        #: ``PropbankTreePointer`` pointers.
        self.pieces = pieces

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return '*'.join(rendered)

    def __repr__(self):
        return '<PropbankChainTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are the constituents
        selected from ``tree`` by each piece, in order.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree('*CHAIN*', selected)
@compat.python_2_unicode_compatible
class PropbankSplitTreePointer(PropbankPointer):
    """A pointer to a split constituent, written as pieces joined by ``,``."""

    def __init__(self, pieces):
        #: A list of the pieces that make up this split constituent.
        #: Elements are all ``PropbankTreePointer`` pointers.
        self.pieces = pieces

    def __str__(self):
        rendered = ['%s' % piece for piece in self.pieces]
        return ','.join(rendered)

    def __repr__(self):
        return '<PropbankSplitTreePointer: %s>' % self

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are the constituents
        selected from ``tree`` by each piece, in order.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree('*SPLIT*', selected)
@total_ordering
@compat.python_2_unicode_compatible
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, written ``wordnum:height``.

    The string forms accepted by ``parse()`` are::

        wordnum:height*wordnum:height*...   (chain)
        wordnum:height,wordnum:height,...   (split)
        wordnum:height                      (single constituent)
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string.  Depending on the separators present the
        result is a ``PropbankChainTreePointer``, a
        ``PropbankSplitTreePointer`` or a ``PropbankTreePointer``.

        :raise ValueError: if a simple pointer is not of the form ``a:b``.
        """
        # Chains (xx*yy*zz) are checked before split args (xx,yy,zz), so
        # a chain may contain split pieces but not the other way round.
        compound_forms = (
            ('*', PropbankChainTreePointer),
            (',', PropbankSplitTreePointer),
        )
        for separator, pointer_class in compound_forms:
            parts = s.split(separator)
            if len(parts) > 1:
                return pointer_class([PropbankTreePointer.parse(elt) for elt in parts])

        # Plain pointer: exactly 'wordnum:height'.
        parts = s.split(':')
        if len(parts) != 2:
            raise ValueError('bad propbank pointer %r' % s)
        wordnum, height = parts
        return PropbankTreePointer(int(wordnum), int(height))

    def __str__(self):
        return '%s:%s' % (self.wordnum, self.height)

    def __repr__(self):
        return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height)

    @staticmethod
    def _first_simple(pointer):
        """Descend through chain/split pointers to their first piece."""
        compound = (PropbankChainTreePointer, PropbankSplitTreePointer)
        while isinstance(pointer, compound):
            pointer = pointer.pieces[0]
        return pointer

    def __eq__(self, other):
        # Compound pointers are compared through their first piece.
        other = PropbankTreePointer._first_simple(other)
        if isinstance(other, PropbankTreePointer):
            return self.wordnum == other.wordnum and self.height == other.height
        return self is other

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Compound pointers are compared through their first piece.
        other = PropbankTreePointer._first_simple(other)
        if isinstance(other, PropbankTreePointer):
            # Earlier words first; for the same word, greater height first.
            return (self.wordnum, -self.height) < (other.wordnum, -other.height)
        # Unrelated objects: fall back to an arbitrary id-based order.
        return id(self) < id(other)

    def select(self, tree):
        """
        Return the constituent of ``tree`` that this pointer refers to.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')
        position = self.treepos(tree)
        return tree[position]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError('Parse tree not avaialable')

        # Depth-first walk.  ``stack`` holds the nodes on the current
        # path; ``treepos`` holds the child index chosen at each tree
        # node on that path.
        stack = [tree]
        treepos = []
        words_seen = 0

        while True:
            node = stack[-1]

            # Word node: either the one we want, or count it and back up.
            if not isinstance(node, Tree):
                if words_seen == self.wordnum:
                    # Strip the leaf index plus ``height`` further levels.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                words_seen += 1
                stack.pop()
                continue

            # Tree node: move on to its first or next child.
            if len(treepos) < len(stack):
                treepos.append(0)
            else:
                treepos[-1] += 1

            child_index = treepos[-1]
            if child_index < len(node):
                stack.append(node[child_index])
            else:
                # End of this node's child list: pop up a level.
                stack.pop()
                treepos.pop()
@compat.python_2_unicode_compatible
class PropbankInflection(object):
    """
    The five-character inflection code of a propbank predicate: form,
    tense, aspect, person and voice, one character each, with ``'-'``
    meaning "none".
    """

    # { Inflection Form
    INFINITIVE = 'i'
    GERUND = 'g'
    PARTICIPLE = 'p'
    FINITE = 'v'
    # { Inflection Tense
    FUTURE = 'f'
    PAST = 'p'
    PRESENT = 'n'
    # { Inflection Aspect
    PERFECT = 'p'
    PROGRESSIVE = 'o'
    PERFECT_AND_PROGRESSIVE = 'b'
    # { Inflection Person
    THIRD_PERSON = '3'
    # { Inflection Voice
    ACTIVE = 'a'
    PASSIVE = 'p'
    # { Inflection
    NONE = '-'
    # }

    def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        # Same field order as the string accepted by parse().
        code = self.form + self.tense
        code = code + self.aspect + self.person
        return code + self.voice

    def __repr__(self):
        return '<PropbankInflection: %s>' % self

    # One character class per field, in the order form, tense, aspect,
    # person, voice.
    _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$')

    @staticmethod
    def parse(s):
        """
        Build a ``PropbankInflection`` from a five-character code.

        :raise TypeError: if ``s`` is not a string.
        :raise ValueError: if ``s`` is not exactly five valid characters.
        """
        if not isinstance(s, string_types):
            raise TypeError('expected a string')
        well_formed = len(s) == 5 and PropbankInflection._VALIDATE.match(s)
        if not well_formed:
            raise ValueError('Bad propbank inflection string %r' % s)
        form, tense, aspect, person, voice = s
        return PropbankInflection(form, tense, aspect, person, voice)

View File

@@ -0,0 +1,143 @@
# Natural Language Toolkit: Pros and Cons Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Pros and Cons dataset.
- Pros and Cons dataset information -
Contact: Bing Liu, liub@cs.uic.edu
http://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
Opinions on the Web". Proceedings of the 14th international World Wide Web
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
"""
import re
from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons')
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    """

    CorpusView = StreamBackedCorpusView

    # A sentence line: optional leading whitespace, then the text wrapped
    # in <Pros>...</Pros> or <Cons>...</Cons>.  Group 2 is the text.
    _SENT_RE = re.compile(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>")

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding='utf8',
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        :raise ValueError: if both ``fileids`` and ``categories`` are given.
        """
        return self._views(fileids, categories, self._read_sent_block)

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        :raise ValueError: if both ``fileids`` and ``categories`` are given.
        """
        return self._views(fileids, categories, self._read_word_block)

    def _views(self, fileids, categories, block_reader):
        """
        Resolve the file selection and return the concatenation of one
        corpus view per selected file, each driven by ``block_reader``.
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, block_reader, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        """
        Read up to 20 lines from ``stream`` and return the tokenized text
        of every line that is a well-formed Pros/Cons sentence.
        """
        sents = []
        for _ in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: further reads would only return '' again.
                break
            sent = self._SENT_RE.match(line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        """Read one sentence block and flatten it into a list of tokens."""
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words

    def _resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) selection into fileids.

        :raise ValueError: if both arguments are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        return fileids

View File

@@ -0,0 +1,355 @@
# Natural Language Toolkit: Product Reviews Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).
- Customer Review Corpus information -
Annotated by: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
http://www.cs.uic.edu/~liub
Distributed with permission.
The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge
Discovery & Data Mining (KDD-04), 2004.
- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
Proceedings of Nineteenth National Conference on Artificial Intelligence
(AAAI-2004), 2004.
- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Approach to
Opinion Mining." Proceedings of First ACM International Conference on Web
Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
Stanford, California, USA.
Symbols used in the annotated reviews:
[t] : the title of the review: Each [t] tag starts a review.
xxxx[+|-n]: xxxx is a product feature.
[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
Note that the strength is quite subjective.
You may want to ignore it and consider only + and -
[-n]: Negative opinion
## : start of each sentence. Each line is a sentence.
[u] : feature not appeared in the sentence.
[p] : feature not appeared in the sentence. Pronoun resolution is needed.
[s] : suggestion or recommendation.
[cc]: comparison with a competing product from a different brand.
[cs]: comparison with a competing product from the same brand.
Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
provide separation between different reviews. This is due to the fact that
the dataset was specifically designed for aspect/feature-based sentiment
analysis, for which sentence-level annotation is sufficient. For document-
level classification and analysis, this peculiarity should be taken into
consideration.
"""
from __future__ import division
import re
from six import string_types
from nltk.corpus.reader.api import *
from nltk.tokenize import *
# Line-level patterns for the annotated review format.

# A review title line: starts with "[t]"; group 1 is the rest of the line.
TITLE = re.compile(r'^\[t\](.*)$') # [t] Title
# An annotated feature: one or more whitespace-separated words immediately
# followed by a bracketed sign and single digit.  Group 1 is the feature
# phrase, group 2 the signed strength (e.g. '+3').
FEATURES = re.compile(
    r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]'
) # find 'feature' in feature[+3]
# A bracketed note tag; group 1 is one of p, u, s, cc, cs.
NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]') # find 'p' in camera[+2][p]
# The sentence text: group 1 is everything after "##" up to the end of the line.
SENT = re.compile(r'##(.*)$') # find tokenized sentence
@compat.python_2_unicode_compatible
class Review(object):
    """
    A single review: a title plus the ordered ReviewLines that follow it.
    This is the main block produced by a ReviewsCorpusReader.
    """

    def __init__(self, title=None, review_lines=None):
        """
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the
            Review.  When omitted, the review starts with a new empty list.
        """
        self.title = title
        self.review_lines = [] if review_lines is None else review_lines

    def add_line(self, review_line):
        """
        Append a ReviewLine to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        """
        assert isinstance(review_line, ReviewLine)
        self.review_lines.append(review_line)

    def features(self):
        """
        Collect the features of every line of the review, in line order.
        Each feature is a tuple made of the specific item feature and the
        opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        """
        return [
            feature
            for review_line in self.review_lines
            for feature in review_line.features
        ]

    def sents(self):
        """
        Return the tokenized sentence of every line of the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        """
        collected = []
        for review_line in self.review_lines:
            collected.append(review_line.sent)
        return collected

    def __repr__(self):
        template = 'Review(title=\"{}\", review_lines={})'
        return template.format(self.title, self.review_lines)
@compat.python_2_unicode_compatible
class ReviewLine(object):
    """
    One sentence of a review, together with the (optional) feature
    annotations and notes attached to it.
    """

    def __init__(self, sent, features=None, notes=None):
        """
        :param sent: the sentence of this line.
        :param features: the features annotated on this line; a new empty
            list when omitted.
        :param notes: the notes annotated on this line; a new empty list
            when omitted.
        """
        self.sent = sent
        self.features = [] if features is None else features
        self.notes = [] if notes is None else notes

    def __repr__(self):
        template = 'ReviewLine(features={}, notes={}, sent={})'
        return template.format(self.features, self.notes, self.sent)
class ReviewsCorpusReader(CorpusReader):
    """
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

        >>> from nltk.corpus import product_reviews_1
        >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
        >>> review = camera_reviews[0]
        >>> review.sents()[0]
        ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
        'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
        >>> review.features()
        [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
        ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
        ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
        ('option', '+1')]

    We can also reach the same information directly from the stream:

        >>> product_reviews_1.features('Canon_G3.txt')
        [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

        >>> from __future__ import division
        >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> # We use float for backward compatibility with division in Python2.7
        >>> mean = tot / n_reviews
        >>> print(n_reviews, tot, mean)
        15 24 1.6
    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding='utf8'
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer

    def features(self, fileids=None):
        """
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(fileid, self._read_features, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_all(self, fileid):
        """Return the full contents of ``fileid``, closing the stream afterwards."""
        stream = self.open(fileid)
        try:
            return stream.read()
        finally:
            # Close explicitly instead of leaving the handle open until
            # the stream object happens to be collected.
            stream.close()

    def raw(self, fileids=None):
        """
        :param fileids: a list or regexp specifying the fileids of the files that
            have to be returned as a raw string.
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self._read_all(f) for f in fileids])

    def readme(self):
        """
        Return the contents of the corpus README.txt file.
        """
        return self._read_all("README.txt")

    def reviews(self, fileids=None):
        """
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        """
        if fileids is None:
            fileids = self._fileids
        return concat(
            [
                self.CorpusView(fileid, self._read_review_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_features(self, stream):
        """
        Read up to 20 lines and return every (feature, strength) pair
        found in them; stops early at end of file.
        """
        features = []
        for _ in range(20):
            line = stream.readline()
            if not line:
                return features
            features.extend(FEATURES.findall(line))
        return features

    def _read_review_block(self, stream):
        """
        Read one review: skip ahead to the next title line, then collect
        lines until the following title line or end of file.

        :return: a one-element list holding the Review, or an empty list
            if no title line is found before end of file.
        """
        # Skip forward to the title line that opens the next review.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            title_match = TITLE.match(line)
            if title_match:
                review = Review(title=title_match.group(1).strip())
                break

        # Scan until we find another title line, or EOF.
        while True:
            oldpos = stream.tell()
            line = stream.readline()
            # End of file:
            if not line:
                return [review]
            # Start of a new review: back up to just before it starts, and
            # return the review we've already collected.
            if TITLE.match(line):
                stream.seek(oldpos)
                return [review]
            # Anything else is part of the review.
            feats = FEATURES.findall(line)
            notes = NOTES.findall(line)
            sent = SENT.findall(line)
            if sent:
                sent = self._word_tokenizer.tokenize(sent[0])
            review.add_line(ReviewLine(sent=sent, features=feats, notes=notes))

    def _read_sent_block(self, stream):
        """Read one review block and return its tokenized sentences."""
        sents = []
        for review in self._read_review_block(stream):
            sents.extend(review.sents())
        return sents

    def _read_word_block(self, stream):
        """
        Read up to 20 lines and return the tokens of every sentence found
        in them as one flat list.
        """
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # End of file: nothing more to read in this block.
                break
            sent = SENT.findall(line)
            if sent:
                words.extend(self._word_tokenizer.tokenize(sent[0]))
        return words

View File

@@ -0,0 +1,151 @@
# Natural Language Toolkit: RTE Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora.
The files were taken from the RTE1, RTE2 and RTE3 datasets and the files
were regularized.
Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the
gold standard annotated files.
Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following
example is taken from RTE3::
<pair id="1" entailment="YES" task="IE" length="short" >
<t>The sale was made to pay Yukos' US$ 27.5 billion tax bill,
Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known
company Baikalfinansgroup which was later bought by the Russian
state-owned oil company Rosneft .</t>
<h>Baikalfinansgroup was sold to Rosneft.</h>
</pair>
In order to provide globally unique IDs for each pair, a new attribute
``challenge`` has been added to the root element ``entailment-corpus`` of each
file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the
challenge number and 'n' is the pair ID.
"""
from __future__ import unicode_literals
from six import string_types
from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import *
def norm(value_string):
    """
    Map the label found in an RTE pair's ``value`` or ``entailment``
    attribute to an integer: ``TRUE``/``YES`` become 1 and ``FALSE``/``NO``
    become 0.  Matching ignores case.

    :param value_string: the label used to classify a text/hypothesis pair
    :type value_string: str
    :rtype: int
    :raise KeyError: if the label is not one of the four known values
    """
    label = value_string.upper()
    if label in ("TRUE", "YES"):
        return 1
    if label in ("FALSE", "NO"):
        return 0
    raise KeyError(label)
@compat.python_2_unicode_compatible
class RTEPair(object):
    """
    Container for RTE text-hypothesis pairs.

    The entailment relation is signalled by the ``value`` attribute in RTE1, and by
    ``entailment`` in RTE2 and RTE3. Whichever is present is normalized with
    ``norm()`` and stored on the ``value`` attribute of this class.
    """

    def __init__(
        self,
        pair,
        challenge=None,
        id=None,
        text=None,
        hyp=None,
        value=None,
        task=None,
        length=None,
    ):
        """
        :param pair: the ``<pair>`` XML element; its attributes and its first
            two children supply the id, text and hypothesis
        :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        :param id: identifier for the pair (not read; the id comes from ``pair``)
        :param text: the text component of the pair (not read; comes from ``pair``)
        :param hyp: the hypothesis component of the pair (not read; comes from ``pair``)
        :param value: classification label used when ``pair`` has neither a
            ``value`` nor an ``entailment`` attribute
        :param task: attribute for the particular NLP task that the data was
            drawn from; used when ``pair`` has no ``task`` attribute
        :param length: attribute for the length of the text of the pair; used
            when ``pair`` has no ``length`` attribute
        """
        attrib = pair.attrib

        self.challenge = challenge
        self.id = attrib["id"]
        self.gid = "%s-%s" % (self.challenge, self.id)
        self.text = pair[0].text
        self.hyp = pair[1].text

        # RTE1 labels pairs with 'value', later challenges with 'entailment';
        # 'value' wins when both are present.
        for label_key in ("value", "entailment"):
            if label_key in attrib:
                self.value = norm(attrib[label_key])
                break
        else:
            self.value = value

        self.task = attrib["task"] if "task" in attrib else task
        self.length = attrib["length"] if "length" in attrib else length

    def __repr__(self):
        if not self.challenge:
            return '<RTEPair: id=%s>' % self.id
        return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)
class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected
    structure of input documents.
    """

    def _read_etree(self, doc):
        """
        Map the XML input into a list of RTEPairs.

        This uses the ``iter()`` method of the ElementTree element to find
        all the ``<pair>`` elements.  (``getiterator()``, used previously,
        was deprecated and has been removed in Python 3.9.)

        :param doc: a parsed XML document
        :rtype: list(RTEPair)
        """
        # The 'challenge' attribute is optional on the root element.
        challenge = doc.attrib.get('challenge')
        return [RTEPair(pair, challenge=challenge) for pair in doc.iter("pair")]

    def pairs(self, fileids):
        """
        Build a list of RTEPairs from a RTE corpus.

        :param fileids: a list of RTE corpus fileids (a single fileid string
            is also accepted)
        :type: list
        :rtype: list(RTEPair)
        """
        if isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self._read_etree(self.xml(fileid)) for fileid in fileids])

View File

@@ -0,0 +1,297 @@
# Natural Language Toolkit: SemCor Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nschneid@cs.cmu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Corpus reader for the SemCor Corpus.
"""
from __future__ import absolute_import, unicode_literals
__docformat__ = 'epytext en'
from nltk.corpus.reader.api import *
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView
from nltk.tree import Tree
class SemcorCorpusReader(XMLCorpusReader):
"""
Corpus reader for the SemCor Corpus.
For access to the complete XML data structure, use the ``xml()``
method. For access to simple word lists and tagged word lists, use
``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.
"""
def __init__(self, root, fileids, wordnet, lazy=True):
    """
    :param root: passed through to ``XMLCorpusReader.__init__``.
    :param fileids: passed through to ``XMLCorpusReader.__init__``.
    :param wordnet: the WordNet reader handed to the word views for
        looking up senses.
    :param lazy: if true, ``_items`` builds ``SemcorWordView`` views;
        otherwise it reads whole files with ``_words``.
    """
    XMLCorpusReader.__init__(self, root, fileids)
    self._wordnet = wordnet
    self._lazy = lazy
def words(self, fileids=None):
    """
    Read the given file(s) as one flat sequence of words, with no
    sentence bracketing and no tags.

    :return: the given file(s) as a list of words and punctuation symbols.
    :rtype: list(str)
    """
    return self._items(
        fileids, unit='word', bracket_sent=False, pos_tag=False, sem_tag=False
    )
def chunks(self, fileids=None):
    """
    Read the given file(s) as one flat sequence of untagged chunks.

    :return: the given file(s) as a list of chunks,
        each of which is a list of words and punctuation symbols
        that form a unit.
    :rtype: list(list(str))
    """
    return self._items(
        fileids, unit='chunk', bracket_sent=False, pos_tag=False, sem_tag=False
    )
def tagged_chunks(self, fileids=None, tag=('pos' or 'sem' or 'both')):
    """
    Read the given file(s) as one flat sequence of tagged chunks.

    :return: the given file(s) as a list of tagged chunks, represented
        in tree form.
    :rtype: list(Tree)

    :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
        to indicate the kind of tags to include.  The default expression
        evaluates to `'pos'`.  Semantic tags consist of
        WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
        without a specific entry in WordNet.  (Named entities of type 'other'
        have no lemma.  Other chunks not in WordNet have no semantic tag.
        Punctuation tokens have `None` for their part of speech tag.)
    """
    # Any value other than 'sem' includes POS tags, and any value other
    # than 'pos' includes semantic tags.
    include_pos = tag != 'sem'
    include_sem = tag != 'pos'
    return self._items(fileids, 'chunk', False, include_pos, include_sem)
def sents(self, fileids=None):
    """
    Read the given file(s) sentence by sentence, as untagged words.

    :return: the given file(s) as a list of sentences, each encoded
        as a list of word strings.
    :rtype: list(list(str))
    """
    return self._items(
        fileids, unit='word', bracket_sent=True, pos_tag=False, sem_tag=False
    )
def chunk_sents(self, fileids=None):
    """
    Read the given file(s) sentence by sentence, as untagged chunks.

    :return: the given file(s) as a list of sentences, each encoded
        as a list of chunks.
    :rtype: list(list(list(str)))
    """
    return self._items(
        fileids, unit='chunk', bracket_sent=True, pos_tag=False, sem_tag=False
    )
def tagged_sents(self, fileids=None, tag=('pos' or 'sem' or 'both')):
    """
    Read the given file(s) sentence by sentence, as tagged chunks.

    :return: the given file(s) as a list of sentences. Each sentence
        is represented as a list of tagged chunks (in tree form).
    :rtype: list(list(Tree))

    :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'`
        to indicate the kind of tags to include.  The default expression
        evaluates to `'pos'`.  Semantic tags consist of
        WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity
        without a specific entry in WordNet.  (Named entities of type 'other'
        have no lemma.  Other chunks not in WordNet have no semantic tag.
        Punctuation tokens have `None` for their part of speech tag.)
    """
    # Any value other than 'sem' includes POS tags, and any value other
    # than 'pos' includes semantic tags.
    include_pos = tag != 'sem'
    include_sem = tag != 'pos'
    return self._items(fileids, 'chunk', True, include_pos, include_sem)
def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag):
    """
    Build the concatenated view behind ``words()``, ``sents()``,
    ``chunks()`` and their tagged variants.

    :param fileids: the files to read, as accepted by ``abspaths()``.
    :param unit: `'word'` or `'chunk'`.
    :param bracket_sent: if true, group the items by sentence.
    :param pos_tag: whether to include part-of-speech tags.
    :param sem_tag: whether to include semantic tags.
    """
    if self._lazy:

        def read(path):
            return SemcorWordView(
                path, unit, bracket_sent, pos_tag, sem_tag, self._wordnet
            )

    else:

        def read(path):
            # _words() takes the WordNet reader from self, so it must not
            # be passed here: the extra positional argument made every
            # non-lazy read fail with a TypeError.
            return self._words(path, unit, bracket_sent, pos_tag, sem_tag)

    if unit == 'word' and not bracket_sent:
        # A single item may be a multiword unit (a list of words), so the
        # LazyConcatenation makes sure the result is flattened.
        views = [LazyConcatenation(read(path)) for path in self.abspaths(fileids)]
    else:
        views = [read(path) for path in self.abspaths(fileids)]
    return concat(views)
def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
    """
    Eagerly read one file and return its tokens, (segmented) words,
    chunks, or sentences.  This is the helper behind the view methods.
    The tokens and chunks may optionally be tagged (with POS and sense
    information).

    :param fileid: The name of the underlying file.
    :param unit: One of `'token'`, `'word'`, or `'chunk'`.
    :param bracket_sent: If true, include sentence bracketing.
    :param pos_tag: Whether to include part-of-speech tags.
    :param sem_tag: Whether to include semantic tags, namely WordNet lemma
        and OOV named entity status.
    """
    assert unit in ('token', 'word', 'chunk')

    collected = []
    root = ElementTree.parse(fileid).getroot()
    for xmlsent in root.findall('.//s'):
        items = []
        # For unit 'word' each converted element is a list of words and
        # is spliced in; otherwise it is added as a single item.
        add = items.extend if unit == 'word' else items.append
        for xmlword in _all_xmlwords_in(xmlsent):
            add(
                SemcorCorpusReader._word(
                    xmlword, unit, pos_tag, sem_tag, self._wordnet
                )
            )

        if bracket_sent:
            collected.append(SemcorSentence(xmlsent.attrib['snum'], items))
        else:
            collected.extend(items)

    assert None not in collected
    return collected
@staticmethod
def _word(xmlword, unit, pos_tag, sem_tag, wordnet):
tkn = xmlword.text
if not tkn:
tkn = "" # fixes issue 337?
lemma = xmlword.get('lemma', tkn) # lemma or NE class
lexsn = xmlword.get('lexsn') # lex_sense (locator for the lemma's sense)
if lexsn is not None:
sense_key = lemma + '%' + lexsn
wnpos = ('n', 'v', 'a', 'r', 's')[
int(lexsn.split(':')[0]) - 1
] # see http://wordnet.princeton.edu/man/senseidx.5WN.html
else:
sense_key = wnpos = None
redef = xmlword.get(
'rdf', tkn
) # redefinition--this indicates the lookup string
# does not exactly match the enclosed string, e.g. due to typographical adjustments
# or discontinuity of a multiword expression. If a redefinition has occurred,
# the "rdf" attribute holds its inflected form and "lemma" holds its lemma.
# For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class).
sensenum = xmlword.get('wnsn') # WordNet sense number
isOOVEntity = 'pn' in xmlword.keys() # a "personal name" (NE) not in WordNet
pos = xmlword.get(
'pos'
) # part of speech for the whole chunk (None for punctuation)
if unit == 'token':
if not pos_tag and not sem_tag:
itm = tkn
else:
itm = (
(tkn,)
+ ((pos,) if pos_tag else ())
+ ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ())
)
return itm
else:
ww = tkn.split('_') # TODO: case where punctuation intervenes in MWE
if unit == 'word':
return ww
else:
if sensenum is not None:
try:
sense = wordnet.lemma_from_key(sense_key) # Lemma object
except Exception:
# cannot retrieve the wordnet.Lemma object. possible reasons:
# (a) the wordnet corpus is not downloaded;
# (b) a nonexistant sense is annotated: e.g., such.s.00 triggers:
# nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00'
# solution: just use the lemma name as a string
try:
sense = '%s.%s.%02d' % (
lemma,
wnpos,
int(sensenum),
) # e.g.: reach.v.02
except ValueError:
sense = (
lemma + '.' + wnpos + '.' + sensenum
) # e.g. the sense number may be "2;1"
bottom = [Tree(pos, ww)] if pos_tag else ww
if sem_tag and isOOVEntity:
if sensenum is not None:
return Tree(sense, [Tree('NE', bottom)])
else: # 'other' NE
return Tree('NE', bottom)
elif sem_tag and sensenum is not None:
return Tree(sense, bottom)
elif pos_tag:
return bottom[0]
else:
return bottom # chunk as a list
def _all_xmlwords_in(elt, result=None):
if result is None:
result = []
for child in elt:
if child.tag in ('wf', 'punc'):
result.append(child)
else:
_all_xmlwords_in(child, result)
return result
class SemcorSentence(list):
    """
    A list of words, augmented by an attribute ``num`` that records
    the sentence identifier supplied by the caller (the readers in this
    module pass the ``snum`` attribute of the XML sentence element).
    """

    def __init__(self, num, items):
        super(SemcorSentence, self).__init__(items)
        self.num = num
class SemcorWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the SemCor corpus.
    """

    def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet):
        """
        :param fileid: The name of the underlying file.
        :param unit: One of `'token'`, `'word'`, or `'chunk'`.
        :param bracket_sent: If true, include sentence bracketing.
        :param pos_tag: Whether to include part-of-speech tags.
        :param sem_tag: Whether to include semantic tags, namely WordNet lemma
            and OOV named entity status.
        :param wordnet: Passed through to ``SemcorCorpusReader._word``.
        """
        self._unit = unit
        self._sent = bracket_sent
        self._pos_tag = pos_tag
        self._sem_tag = sem_tag
        self._wordnet = wordnet
        # Select whole sentences, or the individual word/punctuation
        # elements directly below a sentence.
        tagspec = '.*/s' if bracket_sent else '.*/s/(punc|wf)'
        XMLCorpusView.__init__(self, fileid, tagspec)

    def handle_elt(self, elt, context):
        """Dispatch a matched element to the sentence or word handler."""
        handler = self.handle_sent if self._sent else self.handle_word
        return handler(elt)

    def handle_word(self, elt):
        """Convert a single word/punctuation element."""
        return SemcorCorpusReader._word(
            elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet
        )

    def handle_sent(self, elt):
        """
        Convert a sentence element into a ``SemcorSentence``.

        :raise ValueError: If a direct child is neither ``wf`` nor ``punc``.
        """
        sent = []
        # For unit 'word' every element yields a list that is spliced in.
        add = sent.extend if self._unit == 'word' else sent.append
        for child in elt:
            if child.tag not in ('wf', 'punc'):
                raise ValueError('Unexpected element %s' % child.tag)
            add(self.handle_word(child))
        return SemcorSentence(elt.attrib['snum'], sent)

View File

@@ -0,0 +1,212 @@
# Natural Language Toolkit: Senseval 2 Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Steven Bird <stevenbird1@gmail.com> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Read from the Senseval 2 Corpus.
SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [http://www.siglex.org/]
Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
http://www.d.umn.edu/~tpederse/data.html
Distributed with permission.
The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
"""
from __future__ import print_function, unicode_literals
import re
from xml.etree import ElementTree
from six import string_types
from nltk import compat
from nltk.tokenize import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@compat.python_2_unicode_compatible
class SensevalInstance(object):
    """
    One tagged occurrence of an ambiguous word: the word itself, the
    position of the head in ``context``, the context items, and the
    sense identifiers (stored as a tuple).
    """

    def __init__(self, word, position, context, senses):
        self.word = word
        self.position = position
        self.context = context
        self.senses = tuple(senses)

    def __repr__(self):
        fields = (self.word, self.position, self.context, self.senses)
        return 'SensevalInstance(word=%r, position=%r, context=%r, senses=%r)' % fields
class SensevalCorpusReader(CorpusReader):
    """Corpus reader exposing Senseval files as ``SensevalInstance`` views."""

    def instances(self, fileids=None):
        """
        :return: the concatenation of one ``SensevalCorpusView`` per
            file in ``fileids``.
        """
        return concat(
            [
                SensevalCorpusView(fileid, enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close every stream explicitly instead of leaving it to the
            # garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _entry(self, tree):
        """
        Extract ``(sense, context)`` pairs from a parsed tree: for every
        ``instance`` under every ``lexelt``, the sense is the ``senseid``
        attribute of the first child and the context is the list of
        ``(text, pos)`` pairs of the second child's children.
        """
        elts = []
        for lexelt in tree.findall('lexelt'):
            for inst in lexelt.findall('instance'):
                sense = inst[0].attrib['senseid']
                context = [(w.text, w.attrib['pos']) for w in inst[1]]
                elts.append((sense, context))
        return elts
class SensevalCorpusView(StreamBackedCorpusView):
    """
    Stream-backed view that reads a Senseval file one ``<instance>``
    at a time and yields a ``SensevalInstance`` for each.
    """

    def __init__(self, fileid, encoding):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        self._word_tokenizer = WhitespaceTokenizer()
        # Parallel lists: the lexelt name recorded at index i applies
        # from the stream position recorded at index i.
        self._lexelt_starts = [0]  # list of streampos
        self._lexelts = [None]  # list of lexelt names

    def read_block(self, stream):
        """
        Read lines up to and including the next ``</instance`` line and
        return a one-element list holding the parsed instance.  Returns
        an empty list when the end of the stream is reached.
        """
        # Decide which lexical element we're in.
        # NOTE(review): ``bisect`` is not imported explicitly in this
        # module; presumably it arrives through one of the star imports.
        lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == '':
                # End of stream: no instance may be left half-read.
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]  # strip the surrounding quotes
                if lexelt_num < len(self._lexelts):
                    # Already recorded on an earlier pass; must agree.
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith('</instance'):
                xml_block = '\n'.join(instance_lines)
                # Repair the pseudo-XML before handing it to the parser.
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]

    def _parse_instance(self, instance, lexelt):
        """
        Build a ``SensevalInstance`` from a parsed ``<instance>`` element.

        ``<answer>`` children contribute sense ids; the ``<context>``
        child contributes the context items (whitespace-separated plain
        words, and ``(text, pos)`` tuples for ``<wf>`` elements) and the
        position of the ``<head>`` within that context.
        """
        senses = []
        context = []
        position = None
        for child in instance:
            if child.tag == 'answer':
                senses.append(child.attrib['senseid'])
            elif child.tag == 'context':
                context += self._word_tokenizer.tokenize(child.text)
                for cword in child:
                    if cword.tag == 'compound':
                        cword = cword[0]  # is this ok to do?

                    if cword.tag == 'head':
                        # Some sanity checks: the head holds either text
                        # or exactly one child element, never both.
                        assert position is None, 'head specified twice'
                        assert cword.text.strip() or len(cword) == 1
                        assert not (cword.text.strip() and len(cword) == 1)
                        # Record the position of the head:
                        position = len(context)
                        # Add on the head word itself:
                        if cword.text.strip():
                            context.append(cword.text.strip())
                        elif cword[0].tag == 'wf':
                            context.append((cword[0].text, cword[0].attrib['pos']))
                            if cword[0].tail:
                                context += self._word_tokenizer.tokenize(cword[0].tail)
                        else:
                            assert False, 'expected CDATA or wf in <head>'
                    elif cword.tag == 'wf':
                        context.append((cword.text, cword.attrib['pos']))
                    elif cword.tag == 's':
                        pass  # Sentence boundary marker.

                    else:
                        print('ACK', cword.tag)
                        assert False, 'expected CDATA or <wf> or <head>'
                    # Text following the element belongs to the context too.
                    if cword.tail:
                        context += self._word_tokenizer.tokenize(cword.tail)
            else:
                assert False, 'unexpected tag %s' % child.tag
        return SensevalInstance(lexelt, position, context, senses)
def _fixXML(text):
"""
Fix the various issues with Senseval pseudo-XML.
"""
# <~> or <^> => ~ or ^
text = re.sub(r'<([~\^])>', r'\1', text)
# fix lone &
text = re.sub(r'(\s+)\&(\s+)', r'\1&amp;\2', text)
# fix """
text = re.sub(r'"""', '\'"\'', text)
# fix <s snum=dd> => <s snum="dd"/>
text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text)
# fix foreign word tag
text = re.sub(r'<\&frasl>\s*<p[^>]*>', 'FRASL', text)
# remove <&I .>
text = re.sub(r'<\&I[^>]*>', '', text)
# fix <{word}>
text = re.sub(r'<{([^}]+)}>', r'\1', text)
# remove <@>, <p>, </p>
text = re.sub(r'<(@|/?p)>', r'', text)
# remove <&M .> and <&T .> and <&Ms .>
text = re.sub(r'<&\w+ \.>', r'', text)
# remove <!DOCTYPE... > lines
text = re.sub(r'<!DOCTYPE[^>]*>', r'', text)
# remove <[hi]> and <[/p]> etc
text = re.sub(r'<\[\/?[^>]+\]*>', r'', text)
# take the thing out of the brackets: <&hellip;>
text = re.sub(r'<(\&\w+;)>', r'\1', text)
# and remove the & for those patterns that aren't regular XML
text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text)
# fix 'abc <p="foo"/>' style tags - now <wf pos="foo">abc</wf>
text = re.sub(
r'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>', r' <wf pos="\2">\1</wf>', text
)
text = re.sub(r'\s*"\s*<p=\'"\'/>', " <wf pos='\"'>\"</wf>", text)
return text

View File

@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
# Natural Language Toolkit: SentiWordNet
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for SentiWordNet
SentiWordNet is a lexical resource for opinion mining.
SentiWordNet assigns to each synset of WordNet three
sentiment scores: positivity, negativity, and objectivity.
For details about SentiWordNet see:
http://sentiwordnet.isti.cnr.it/
>>> from nltk.corpus import sentiwordnet as swn
>>> print(swn.senti_synset('breakdown.n.03'))
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
>>> list(swn.senti_synsets('slow'))
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
SentiSynset('behind.r.03')]
>>> happy = swn.senti_synsets('happy', 'a')
>>> happy0 = list(happy)[0]
>>> happy0.pos_score()
0.875
>>> happy0.neg_score()
0.0
>>> happy0.obj_score()
0.125
"""
import re
from nltk.compat import python_2_unicode_compatible
from nltk.corpus.reader import CorpusReader
@python_2_unicode_compatible
class SentiWordNetCorpusReader(CorpusReader):
    """
    Corpus reader for a single SentiWordNet file.  Scores are kept in
    ``self._db``, keyed by ``(pos, offset)`` with values
    ``(pos_score, neg_score)``.
    """

    def __init__(self, root, fileids, encoding='utf-8'):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.

        :raise ValueError: If ``fileids`` does not resolve to exactly
            one file.
        """
        super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError('Exactly one file must be specified')
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """
        Load the source file into ``self._db``.  Lines starting with
        ``#`` are skipped; every other line must have six tab-separated
        fields.

        :raise ValueError: If a line does not have exactly six fields.
            The reported line number counts non-comment lines only.
        """
        stream = self.open(self._fileids[0])
        # Close the stream as soon as its contents have been read.
        try:
            lines = stream.read().splitlines()
        finally:
            stream.close()
        lines = filter((lambda x: not re.search(r"^\s*#", x)), lines)
        for i, line in enumerate(lines):
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, synset_terms, gloss = fields
            except ValueError:
                # Only a field-count mismatch is expected here; anything
                # else should propagate unchanged.
                raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line))
            if pos and offset:
                offset = int(offset)
                self._db[(pos, offset)] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Look up a ``SentiSynset`` either by a ``(pos, offset)`` pair or
        by a synset name.

        :return: The matching ``SentiSynset``, or None when a synset
            looked up by name has no entry in the database.
        """
        from nltk.corpus import wordnet as wn

        if tuple(vals) in self._db:
            pos_score, neg_score = self._db[tuple(vals)]
            pos, offset = vals
            if pos == 's':
                pos = 'a'
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)
        else:
            synset = wn.synset(vals[0])
            pos = synset.pos()
            # Entries for 's' synsets are looked up under 'a'.
            if pos == 's':
                pos = 'a'
            offset = synset.offset()
            if (pos, offset) in self._db:
                pos_score, neg_score = self._db[(pos, offset)]
                return SentiSynset(pos_score, neg_score, synset)
            else:
                return None

    def senti_synsets(self, string, pos=None):
        """
        :return: A ``filter`` result over the ``SentiSynset`` objects for
            every WordNet synset of ``string`` that has an entry here
            (wrap in ``list()`` to materialize it).
        """
        from nltk.corpus import wordnet as wn

        sentis = []
        synset_list = wn.synsets(string, pos)
        for synset in synset_list:
            sentis.append(self.senti_synset(synset.name()))
        sentis = filter(lambda x: x, sentis)
        return sentis

    def all_senti_synsets(self):
        """Yield a ``SentiSynset`` for every entry in the database."""
        from nltk.corpus import wordnet as wn

        for key, fields in self._db.items():
            pos, offset = key
            pos_score, neg_score = fields
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
@python_2_unicode_compatible
class SentiSynset(object):
    """
    A synset paired with its positivity and negativity scores; the
    objectivity score is derived as ``1.0 - (pos_score + neg_score)``.
    """

    def __init__(self, pos_score, neg_score, synset):
        self.synset = synset
        self._pos_score = pos_score
        self._neg_score = neg_score
        self._obj_score = 1.0 - (self._pos_score + self._neg_score)

    def pos_score(self):
        """Return the positivity score."""
        return self._pos_score

    def neg_score(self):
        """Return the negativity score."""
        return self._neg_score

    def obj_score(self):
        """Return the objectivity score."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        parts = [
            "<",
            self.synset.name(),
            ": ",
            "PosScore=%s " % self._pos_score,
            "NegScore=%s" % self._neg_score,
            ">",
        ]
        return "".join(parts)

    def __repr__(self):
        return "Senti" + repr(self.synset)

View File

@@ -0,0 +1,76 @@
# Natural Language Toolkit: Sinica Treebank Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Sinica Treebank Corpus Sample
http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm
10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese. Parse tree notation is based on
Information-based Case Grammar. Tagset documentation is available
at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html
Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica
The data is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[http://creativecommons.org/licenses/by-nc-sa/2.5/].
References:
Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank. Computational Linguistics and
Chinese Language Processing, 4, pp 87-104.
Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming
Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
Chinese Language Processing Workshop, Association for Computational
Linguistics.
Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp560-565.
"""
from nltk.tree import sinica_parse
from nltk.tag import map_tag
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# Leading identifier: '#', a run of non-whitespace, then one whitespace
# character at the start of the string.
IDENTIFIER = re.compile(r'^#\S+\s')
# Trailing '#...' text that immediately follows a closing parenthesis.
APPENDIX = re.compile(r'(?<=\))#.*$')
# ':tag:word' pairs; captures the tag first and the word second.
TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')
# Same shape as TAGWORD, but captures only the word.
WORD = re.compile(r':[^:()|]+:([^:()|]+)')
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Reader for the sinica treebank.
    """

    def _read_block(self, stream):
        """Read one line and strip its identifier prefix and '#' appendix."""
        line = stream.readline()
        without_id = IDENTIFIER.sub('', line)
        return [APPENDIX.sub('', without_id)]

    def _parse(self, sent):
        """Parse one sentence string with ``sinica_parse``."""
        return sinica_parse(sent)

    def _tag(self, sent, tagset=None):
        """
        Return the ``(word, tag)`` pairs of ``sent``, mapping tags to
        ``tagset`` when one is given that differs from the reader's own.
        """
        pairs = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
        if tagset and tagset != self._tagset:
            return [
                (word, map_tag(self._tagset, tagset, tag)) for (word, tag) in pairs
            ]
        return pairs

    def _word(self, sent):
        """Return the words of ``sent`` without their tags."""
        return WORD.findall(sent)

View File

@@ -0,0 +1,67 @@
# Natural Language Toolkit: String Category Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Read tuples from a corpus consisting of categorized strings.
For example, from the question classification corpus:
NUM:dist How far is it from Denver to Aspen ?
LOC:city What county is Modesto , California in ?
HUM:desc Who was Galileo ?
DESC:def What is an atom ?
NUM:date When did Hawaii become a state ?
"""
# based on PPAttachmentCorpusReader
from six import string_types
from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
# [xx] Should the order of the tuple be reversed -- in most other places
# in nltk, we use the form (data, tag) -- e.g., tagged words and
# labeled texts for classifiers.
class StringCategoryCorpusReader(CorpusReader):
    """
    Reader for corpora in which every non-empty line holds a category,
    a delimiter, and a string.
    """

    def __init__(self, root, fileids, delimiter=' ', encoding='utf8'):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param delimiter: Field delimiter
        :param encoding: Passed on to ``CorpusReader``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._delimiter = delimiter

    def tuples(self, fileids=None):
        """
        :return: the given file(s) as tuples obtained by splitting each
            line once at the delimiter.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the text contents of the given fileids, as a single string.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close every stream explicitly instead of leaving it to the
            # garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _read_tuple_block(self, stream):
        """
        Read one line; return a one-element list with the line split
        once at the delimiter, or an empty list when the stripped line
        is empty.
        """
        line = stream.readline().strip()
        if line:
            return [tuple(line.split(self._delimiter, 1))]
        else:
            return []

View File

@@ -0,0 +1,129 @@
# Natural Language Toolkit: Switchboard Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals
import re
from nltk.tag import str2tuple, map_tag
from nltk import compat
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
@compat.python_2_unicode_compatible
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, ``speaker`` and ``id``, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        list.__init__(self, words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        if not len(self):
            text = ''
        elif isinstance(self[0], tuple):
            # Tagged turn: render every pair as word/tag.
            text = ' '.join('%s/%s' % pair for pair in self)
        else:
            text = ' '.join(self)
        return '<%s.%s: %r>' % (self.speaker, self.id, text)
class SwitchboardCorpusReader(CorpusReader):
    """
    Reader for the Switchboard sample, exposing its single ``tagged``
    file as words, turns, or discourses, with or without tags.
    """

    _FILES = ['tagged']
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tagset=None):
        """
        :param root: The root directory for this corpus.
        :param tagset: The tagset the corpus tags belong to; used as the
            source when mapping tags to another tagset.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tagset = tagset

    def words(self):
        """Return a view of all words."""
        return StreamBackedCorpusView(self.abspath('tagged'), self._words_block_reader)

    def tagged_words(self, tagset=None):
        """Return a view of all ``(word, tag)`` pairs."""

        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath('tagged'), tagged_words_block_reader)

    def turns(self):
        """Return a view of all turns, each a ``SwitchboardTurn`` of words."""
        return StreamBackedCorpusView(self.abspath('tagged'), self._turns_block_reader)

    def tagged_turns(self, tagset=None):
        """Return a view of all turns, each a ``SwitchboardTurn`` of pairs."""

        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, tagset)

        return StreamBackedCorpusView(self.abspath('tagged'), tagged_turns_block_reader)

    def discourses(self):
        """Return a view of all discourses, each a list of turns."""
        return StreamBackedCorpusView(
            self.abspath('tagged'), self._discourses_block_reader
        )

    def tagged_discourses(self, tagset=None):
        """Return a view of all discourses, each a list of tagged turns."""
        # The default is None, matching tagged_words() and tagged_turns();
        # any falsy value means "no tag mapping".

        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream, tagset)

        return StreamBackedCorpusView(
            self.abspath('tagged'), tagged_discourses_block_reader
        )

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=False)
                for b in read_blankline_block(stream)
                for u in b.split('\n')
                if u.strip()
            ]
        ]

    def _tagged_discourses_block_reader(self, stream, tagset=None):
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [
            [
                self._parse_utterance(u, include_tag=True, tagset=tagset)
                for b in read_blankline_block(stream)
                for u in b.split('\n')
                if u.strip()
            ]
        ]

    def _turns_block_reader(self, stream):
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, tagset=None):
        return self._tagged_discourses_block_reader(stream, tagset)[0]

    def _words_block_reader(self, stream):
        # Flatten the turns in one pass; sum(turns, []) copies the
        # accumulated list for every turn.
        discourse = self._discourses_block_reader(stream)[0]
        return [word for turn in discourse for word in turn]

    def _tagged_words_block_reader(self, stream, tagset=None):
        discourse = self._tagged_discourses_block_reader(stream, tagset)[0]
        return [word for turn in discourse for word in turn]

    # Raw string: '\w', '\.', '\d' etc. are invalid escapes in a plain
    # string literal.
    _UTTERANCE_RE = re.compile(r'(\w+)\.(\d+)\:\s*(.*)')
    _SEP = '/'

    def _parse_utterance(self, utterance, include_tag, tagset=None):
        """
        Parse one ``speaker.id: text`` line into a ``SwitchboardTurn``.

        :param include_tag: If false, keep only the words.
        :param tagset: If given and different from the reader's tagset,
            map every tag to it.
        :raise ValueError: If the line does not match the utterance format.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError('Bad utterance %r' % utterance)
        speaker, id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif tagset and tagset != self._tagset:
            words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, id)

View File

@@ -0,0 +1,394 @@
# Natural Language Toolkit: Tagged Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora whose documents contain part-of-speech-tagged words.
"""
import os
from six import string_types
from nltk.tag import str2tuple, map_tag
from nltk.tokenize import *
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.timit import read_timit_block
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using ``nltk.tag.str2tuple``.  By default, ``'/'`` is used as the
    separator.  I.e., words should have the form::

        word1/tag1 word2/tag2 word3/tag3 ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """

    def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        para_block_reader=read_blankline_block,
        encoding='utf8',
        tagset=None,
    ):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*[.]txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Separator between a word and its tag.
        :param word_tokenizer: Tokenizer splitting a sentence into
            ``word<sep>tag`` strings.
        :param sent_tokenizer: Tokenizer splitting a paragraph into sentences.
        :param para_block_reader: Block reader returning paragraph strings.
        :param encoding: Passed on to ``CorpusReader``.
        :param tagset: The tagset the corpus tags belong to; used as the
            source when mapping tags to another tagset.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tagset = tagset

    def _tag_mapper(self, tagset):
        """Return a tag-mapping function for ``tagset``, or None if no mapping is needed."""
        if tagset and tagset != self._tagset:
            return lambda t: map_tag(self._tagset, tagset, t)
        return None

    def _make_views(
        self, fileids, tagged, group_by_sent, group_by_para, tag_mapping_function=None
    ):
        """Concatenate one ``TaggedCorpusView`` per file with the given grouping flags."""
        return concat(
            [
                TaggedCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    self._sep,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._para_block_reader,
                    tag_mapping_function,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close every stream explicitly instead of leaving it to the
            # garbage collector.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._make_views(fileids, False, False, False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._make_views(fileids, False, True, False)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._make_views(fileids, False, True, True)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._make_views(
            fileids, True, False, False, self._tag_mapper(tagset)
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._make_views(fileids, True, True, False, self._tag_mapper(tagset))

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._make_views(fileids, True, True, True, self._tag_mapper(tagset))
class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader):
    """
    A reader for part-of-speech tagged corpora whose documents are
    divided into categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``TaggedCorpusReader``.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a ``fileids``/``categories`` pair into file identifiers.

        :raise ValueError: If both arguments are given.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    def raw(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.raw(self, selected)

    def words(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.words(self, selected)

    def sents(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.sents(self, selected)

    def paras(self, fileids=None, categories=None):
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.paras(self, selected)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_words(self, selected, tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_sents(self, selected, tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_paras(self, selected, tagset)
class TaggedCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for tagged documents.  It can be
    customized via flags to divide the tagged corpus documents up by
    sentence or paragraph, and to include or omit part of speech tags.
    ``TaggedCorpusView`` objects are typically created by
    ``TaggedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        sep,
        word_tokenizer,
        sent_tokenizer,
        para_block_reader,
        tag_mapping_function=None,
    ):
        # Output shape.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Parsing machinery.
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        remap = self._tag_mapping_function
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            # Keep sentences nested, or splice their items into the paragraph.
            add_sent = para.append if self._group_by_sent else para.extend
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                pairs = [
                    str2tuple(item, self._sep)
                    for item in self._word_tokenizer.tokenize(sent_str)
                ]
                if remap:
                    pairs = [(word, remap(tag)) for (word, tag) in pairs]
                if self._tagged:
                    add_sent(pairs)
                else:
                    add_sent([word for (word, tag) in pairs])
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)
        return block
# needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for the MAC_MORPHO corpus.  Each line contains a
    single tagged word, using '_' as a separator.  Sentence boundaries
    are based on the end-sentence tag ('_.').  Paragraph information
    is not included in the corpus, so each paragraph returned by
    ``self.paras()`` and ``self.tagged_paras()`` contains a single
    sentence.
    """

    def __init__(self, root, fileids, encoding='utf8', tagset=None):
        one_word_per_line = LineTokenizer()
        line_sentences = RegexpTokenizer('.*\n')
        TaggedCorpusReader.__init__(
            self,
            root,
            fileids,
            sep='_',
            word_tokenizer=one_word_per_line,
            sent_tokenizer=line_sentences,
            para_block_reader=self._read_block,
            encoding=encoding,
            tagset=tagset,
        )

    def _read_block(self, stream):
        """Read a block that starts at any line and ends at a line matching ``.*_\\.``."""
        start_re, end_re = r'.*', r'.*_\.'
        return read_regexp_block(stream, start_re, end_re)
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    """

    def __init__(self, *args, **kwargs):
        """
        Accepts the same arguments as ``TaggedCorpusReader``, except that
        ``para_block_reader`` is fixed to ``read_timit_block``.
        """
        TaggedCorpusReader.__init__(
            self, para_block_reader=read_timit_block, *args, **kwargs
        )

    # The overrides accept the base-class arguments so that a call written
    # against ``TaggedCorpusReader`` gets the explanatory
    # NotImplementedError instead of a TypeError about unexpected arguments.
    def paras(self, fileids=None):
        """Not supported for this corpus.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError('use sents() instead')

    def tagged_paras(self, fileids=None, tagset=None):
        """Not supported for this corpus.

        :raise NotImplementedError: Always.
        """
        raise NotImplementedError('use tagged_sents() instead')

View File

@@ -0,0 +1,499 @@
# Natural Language Toolkit: TIMIT Corpus Reader
#
# Copyright (C) 2001-2007 NLTK Project
# Author: Haejoong Lee <haejoong@ldc.upenn.edu>
# Steven Bird <stevenbird1@gmail.com>
# Jacob Perkins <japerk@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
# [xx] this docstring is out-of-date:
"""
Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
This corpus contains a selected portion of the TIMIT corpus.
- 16 speakers from 8 dialect regions
- 1 male and 1 female from each dialect region
- total 130 sentences (10 sentences per speaker. Note that some
sentences are shared among other speakers, especially sa1 and sa2
are spoken by all speakers.)
- total 160 recordings of sentences (10 recordings per speaker)
- audio format: NIST Sphere, single channel, 16kHz sampling,
16 bit sample, PCM encoding
Module contents
===============
The timit corpus reader provides 4 functions and 4 data items.
- utterances
List of utterances in the corpus. There are total 160 utterances,
each of which corresponds to a unique utterance of a speaker.
Here's an example of an utterance identifier in the list::
dr1-fvmh0/sx206
- _---- _---
| | | | |
| | | | |
| | | | `--- sentence number
| | | `----- sentence type (a:all, i:shared, x:exclusive)
| | `--------- speaker ID
| `------------ sex (m:male, f:female)
`-------------- dialect region (1..8)
- speakers
List of speaker IDs. An example of speaker ID::
dr1-fvmh0
Note that if you split an item ID on the slash ('/') and take the first element
of the result, you will get a speaker ID.
>>> itemid = 'dr1-fvmh0/sx206'
>>> spkrid , sentid = itemid.split('/')
>>> spkrid
'dr1-fvmh0'
The second element of the result is a sentence ID.
- dictionary()
Phonetic dictionary of words contained in this corpus. This is a Python
dictionary from words to phoneme lists.
- spkrinfo()
Speaker information table. It's a Python dictionary from speaker IDs to
records of 10 fields. Speaker IDs are the same as the ones in timit.speakers.
Each record is a dictionary from field names to values, and the fields are
as follows::
id speaker ID as defined in the original TIMIT speaker info table
sex speaker gender (M:male, F:female)
dr speaker dialect region (1:new england, 2:northern,
3:north midland, 4:south midland, 5:southern, 6:new york city,
7:western, 8:army brat (moved around))
use corpus type (TRN:training, TST:test)
in this sample corpus only TRN is available
recdate recording date
birthdate speaker birth date
ht speaker height
race speaker race (WHT:white, BLK:black, AMR:american indian,
SPN:spanish-american, ORN:oriental,???:unknown)
edu speaker education level (HS:high school, AS:associate degree,
BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
PHD:doctorate degree (PhD,JD,MD), ??:unknown)
comments comments by the recorder
The 4 functions are as follows.
- tokenized(sentences=items, offset=False)
Given a list of items, returns an iterator of a list of word lists,
each of which corresponds to an item (sentence). If offset is set to True,
each element of the word list is a tuple of word(string), start offset and
end offset, where offset is represented as a number of 16kHz samples.
- phonetic(sentences=items, offset=False)
Given a list of items, returns an iterator of a list of phoneme lists,
each of which corresponds to an item (sentence). If offset is set to True,
each element of the phoneme list is a tuple of phoneme(string), start offset
and end offset, where offset is represented as a number of 16kHz samples.
- audiodata(item, start=0, end=None)
Given an item, returns a chunk of audio samples formatted into a string.
When the function is called, if start and end are omitted, the entire
samples of the recording will be returned. If only end is omitted,
samples from the start offset to the end of the recording will be returned.
- play(data)
Play the given audio samples. The audio samples can be obtained from the
timit.audiodata function.
"""
from __future__ import print_function, unicode_literals
import sys
import os
import re
import tempfile
import time
from six import string_types
from nltk import compat
from nltk.tree import Tree
from nltk.internals import import_from_stdlib
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class TimitCorpusReader(CorpusReader):
    """
    Reader for the TIMIT corpus (or any other corpus with the same
    file layout and use of file formats).  The corpus root directory
    should contain the following files:

      - timitdic.txt: dictionary of standard transcriptions
      - spkrinfo.txt: table of speaker information

    In addition, the root directory should contain one subdirectory
    for each speaker, containing four files for each utterance:

      - <utterance-id>.txt: text content of utterances
      - <utterance-id>.wrd: tokenized text content of utterances
      - <utterance-id>.phn: phonetic transcription of utterances
      - <utterance-id>.wav: utterance sound file
    """

    # A regexp matching fileids that are used by this corpus reader.
    _FILE_RE = r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + r'timitdic\.txt|spkrinfo\.txt'

    # A regexp matching the ``.txt`` file of every utterance; the
    # utterance identifiers are derived from these fileids.
    _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'

    def __init__(self, root, encoding='utf8'):
        """
        Construct a new TIMIT corpus reader in the given directory.

        :param root: The root directory for this corpus.
        :param encoding: The encoding used for the text files.  When a
            single string is given, it is applied to every file except
            the ``.wav`` files, which are read without decoding.
        """
        # Ensure that wave files don't get treated as unicode data:
        if isinstance(encoding, string_types):
            encoding = [(r'.*\.wav', None), ('.*', encoding)]

        CorpusReader.__init__(
            self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding
        )

        # A list of the utterance identifiers for all utterances in
        # this corpus (the ``.txt`` fileids minus their extension).
        self._utterances = [
            name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)
        ]

        # Speaker table; parsed from spkrinfo.txt on first use.
        self._speakerinfo = None
        # NOTE(review): this re-assigns ``_root`` after the base-class
        # constructor has run; confirm the base class does not rely on
        # a normalised value it stored itself.
        self._root = root
        self.speakers = sorted(set(u.split('/')[0] for u in self._utterances))

    def fileids(self, filetype=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus.

        :param filetype: If specified, then ``filetype`` indicates that
            only the files that have the given type should be
            returned.  Accepted values are: ``txt``, ``wrd``, ``phn``,
            ``wav``, or ``metadata``.
        :raise ValueError: If ``filetype`` is any other value.
        """
        if filetype is None:
            return CorpusReader.fileids(self)
        elif filetype in ('txt', 'wrd', 'phn', 'wav'):
            return ['%s.%s' % (u, filetype) for u in self._utterances]
        elif filetype == 'metadata':
            return ['timitdic.txt', 'spkrinfo.txt']
        else:
            raise ValueError('Bad value for filetype: %r' % filetype)

    def utteranceids(
        self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None
    ):
        """
        :return: A list of the utterance identifiers for all
            utterances in this corpus, or for the given speaker, dialect
            region, gender, sentence type, or sentence number, if
            specified.

        Each filter may be a single string or a collection of strings;
        filters that are given are combined with logical AND.
        """
        if isinstance(dialect, string_types):
            dialect = [dialect]
        if isinstance(sex, string_types):
            sex = [sex]
        if isinstance(spkrid, string_types):
            spkrid = [spkrid]
        if isinstance(sent_type, string_types):
            sent_type = [sent_type]
        if isinstance(sentid, string_types):
            sentid = [sentid]

        # The filters below index into the identifier at fixed
        # positions, i.e. they rely on the ``dr1-fvmh0/sx206`` layout:
        # [2] dialect, [4] sex, [:9] speaker, [11] sentence type,
        # [10:] sentence id.
        utterances = self._utterances[:]
        if dialect is not None:
            utterances = [u for u in utterances if u[2] in dialect]
        if sex is not None:
            utterances = [u for u in utterances if u[4] in sex]
        if spkrid is not None:
            utterances = [u for u in utterances if u[:9] in spkrid]
        if sent_type is not None:
            utterances = [u for u in utterances if u[11] in sent_type]
        if sentid is not None:
            # Compare against ``sentid`` (the original compared against
            # ``spkrid``, which broke sentence-id filtering).
            utterances = [u for u in utterances if u[10:] in sentid]
        return utterances

    def transcription_dict(self):
        """
        :return: A dictionary giving the 'standard' transcription for
            each word, as a list of phoneme strings.
        :raise ValueError: If a non-blank, non-comment line of
            ``timitdic.txt`` is not of the form ``word /phonemes/``.
        """
        _transcriptions = {}
        for line in self.open('timitdic.txt'):
            # Skip blank lines and ';' comment lines.
            if not line.strip() or line[0] == ';':
                continue
            m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
            if not m:
                raise ValueError('Bad line: %r' % line)
            _transcriptions[m.group(1)] = m.group(2).split()
        return _transcriptions

    def spkrid(self, utterance):
        """:return: The speaker part (before the '/') of an utterance id."""
        return utterance.split('/')[0]

    def sentid(self, utterance):
        """:return: The sentence part (after the '/') of an utterance id."""
        return utterance.split('/')[1]

    def utterance(self, spkrid, sentid):
        """:return: The utterance id built from a speaker id and a sentence id."""
        return '%s/%s' % (spkrid, sentid)

    def spkrutteranceids(self, speaker):
        """
        :return: A list of all utterances associated with a given
            speaker.
        """
        return [
            utterance
            for utterance in self._utterances
            if utterance.startswith(speaker + '/')
        ]

    def spkrinfo(self, speaker):
        """
        :param speaker: A speaker id; an utterance id is also accepted,
            in which case its speaker is looked up.
        :return: The ``SpeakerInfo`` record parsed from ``spkrinfo.txt``
            for that speaker.
        :raise KeyError: If the speaker is not in the speaker table.
        """
        if speaker in self._utterances:
            speaker = self.spkrid(speaker)

        # Parse the speaker table once and cache it on the reader.
        if self._speakerinfo is None:
            self._speakerinfo = {}
            for line in self.open('spkrinfo.txt'):
                if not line.strip() or line[0] == ';':
                    continue
                # At most 10 fields; the last one keeps any embedded
                # whitespace.
                rec = line.strip().split(None, 9)
                key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower())
                self._speakerinfo[key] = SpeakerInfo(*rec)

        return self._speakerinfo[speaker]

    def phones(self, utterances=None):
        """
        :return: The last whitespace-separated field of every non-blank
            line of the ``.phn`` files of the given utterances (all
            utterances by default), as one flat list.
        """
        return [
            line.split()[-1]
            for fileid in self._utterance_fileids(utterances, '.phn')
            for line in self.open(fileid)
            if line.strip()
        ]

    def phone_times(self, utterances=None):
        """
        offset is represented as a number of 16kHz samples!

        :return: A flat list of ``(phone, start, end)`` tuples read
            from the ``.phn`` files of the given utterances.
        """
        return [
            (line.split()[2], int(line.split()[0]), int(line.split()[1]))
            for fileid in self._utterance_fileids(utterances, '.phn')
            for line in self.open(fileid)
            if line.strip()
        ]

    def words(self, utterances=None):
        """
        :return: The last whitespace-separated field of every non-blank
            line of the ``.wrd`` files of the given utterances (all
            utterances by default), as one flat list.
        """
        return [
            line.split()[-1]
            for fileid in self._utterance_fileids(utterances, '.wrd')
            for line in self.open(fileid)
            if line.strip()
        ]

    def word_times(self, utterances=None):
        """
        :return: A flat list of ``(word, start, end)`` tuples read from
            the ``.wrd`` files of the given utterances.
        """
        return [
            (line.split()[2], int(line.split()[0]), int(line.split()[1]))
            for fileid in self._utterance_fileids(utterances, '.wrd')
            for line in self.open(fileid)
            if line.strip()
        ]

    def sents(self, utterances=None):
        """
        :return: One list of words per utterance, read from the
            ``.wrd`` files of the given utterances.
        """
        return [
            [line.split()[-1] for line in self.open(fileid) if line.strip()]
            for fileid in self._utterance_fileids(utterances, '.wrd')
        ]

    def sent_times(self, utterances=None):
        """
        :return: A flat list of ``(text, start, end)`` tuples read from
            the ``.txt`` files of the given utterances; ``text`` is
            everything after the two leading offset fields.
        """
        return [
            (
                line.split(None, 2)[-1].strip(),
                int(line.split()[0]),
                int(line.split()[1]),
            )
            for fileid in self._utterance_fileids(utterances, '.txt')
            for line in self.open(fileid)
            if line.strip()
        ]

    def phone_trees(self, utterances=None):
        """
        Build one ``Tree('S', ...)`` per sentence, aligning phones to
        words by their end offsets.

        Phones that end before the first remaining word starts, or
        after the last word but within the sentence, are attached
        directly to the sentence node; the others are attached to a
        ``Tree(word, ...)`` child.

        :return: A list of trees, in utterance order.
        """
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, string_types):
            utterances = [utterances]

        trees = []
        for utterance in utterances:
            word_times = self.word_times(utterance)
            phone_times = self.phone_times(utterance)
            sent_times = self.sent_times(utterance)

            while sent_times:
                (sent, sent_start, sent_end) = sent_times.pop(0)
                trees.append(Tree('S', []))
                # Phones ending before the next word begins.
                while (
                    word_times and phone_times and phone_times[0][2] <= word_times[0][1]
                ):
                    trees[-1].append(phone_times.pop(0)[0])
                # Words of this sentence, each with its phones.
                while word_times and word_times[0][2] <= sent_end:
                    (word, word_start, word_end) = word_times.pop(0)
                    trees[-1].append(Tree(word, []))
                    while phone_times and phone_times[0][2] <= word_end:
                        trees[-1][-1].append(phone_times.pop(0)[0])
                # Trailing phones that still fall inside the sentence.
                while phone_times and phone_times[0][2] <= sent_end:
                    trees[-1].append(phone_times.pop(0)[0])
        return trees

    # [xx] NOTE: This is currently broken -- we're assuming that the
    # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE
    # fileids.
    def wav(self, utterance, start=0, end=None):
        """
        :return: The contents of a wave file holding frames ``start``
            up to ``end`` (default: the last frame) of the utterance's
            ``.wav`` file.  See the note above: the input is parsed
            with the stdlib ``wave`` module.
        """
        # nltk.chunk conflicts with the stdlib module 'chunk'
        wave = import_from_stdlib('wave')

        w = wave.open(self.open(utterance + '.wav'), 'rb')

        if end is None:
            end = w.getnframes()

        # Skip past frames before start, then read the frames we want
        w.readframes(start)
        frames = w.readframes(end - start)

        # Open a new temporary file -- the wave module requires
        # an actual file, and won't work w/ stringio. :(
        tf = tempfile.TemporaryFile()
        out = wave.open(tf, 'w')

        # Write the parameters & data to the new file.
        out.setparams(w.getparams())
        out.writeframes(frames)
        out.close()

        # Read the data back from the file, and return it.  The
        # file will automatically be deleted when we return.
        tf.seek(0)
        return tf.read()

    def audiodata(self, utterance, start=0, end=None):
        """
        :return: The raw contents of the utterance's ``.wav`` file from
            sample ``start`` up to sample ``end`` (default: end of
            file), skipping a fixed 44-byte header and counting two
            bytes per sample.
        """
        assert end is None or end > start
        headersize = 44
        if end is None:
            data = self.open(utterance + '.wav').read()
        else:
            data = self.open(utterance + '.wav').read(headersize + end * 2)
        return data[headersize + start * 2 :]

    def _utterance_fileids(self, utterances, extension):
        """
        :return: The fileids obtained by appending ``extension`` to the
            given utterance id(s); all utterances when ``utterances``
            is None.
        """
        if utterances is None:
            utterances = self._utterances
        if isinstance(utterances, string_types):
            utterances = [utterances]
        return ['%s%s' % (u, extension) for u in utterances]

    def play(self, utterance, start=0, end=None):
        """
        Play the given audio sample.

        Tries ``ossaudiodev`` first, then ``pygame``; if neither can be
        imported, a message is printed to stderr.

        :param utterance: The utterance id of the sample to play
        :param start: First sample to play.
        :param end: Sample to stop at (default: end of recording).
        """
        import io

        # Method 1: os audio dev.
        try:
            import ossaudiodev

            try:
                dsp = ossaudiodev.open('w')
                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
                dsp.channels(1)
                dsp.speed(16000)
                dsp.write(self.audiodata(utterance, start, end))
                dsp.close()
            except IOError as e:
                print(
                    (
                        "can't acquire the audio device; please "
                        "activate your audio device."
                    ),
                    file=sys.stderr,
                )
                print("system error message:", str(e), file=sys.stderr)
            return
        except ImportError:
            pass

        # Method 2: pygame
        try:
            import pygame.mixer

            pygame.mixer.init(16000)
            # wav() returns bytes, so wrap it in a bytes buffer; the
            # Python-2-only ``StringIO`` module made this branch
            # unreachable on Python 3.
            f = io.BytesIO(self.wav(utterance, start, end))
            pygame.mixer.Sound(f).play()
            while pygame.mixer.get_busy():
                time.sleep(0.01)
            return
        except ImportError:
            pass

        # Method 3: complain. :)
        print(
            ("you must install pygame or ossaudiodev " "for audio playback."),
            file=sys.stderr,
        )
@compat.python_2_unicode_compatible
class SpeakerInfo(object):
    """
    Plain record holding one speaker's fields; every constructor
    argument is stored unchanged as the attribute of the same name.
    """

    def __init__(
        self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None
    ):
        # Identity fields.
        self.id, self.sex, self.dr, self.use = id, sex, dr, use
        # Dates.
        self.recdate, self.birthdate = recdate, birthdate
        # Remaining fields.
        self.ht, self.race, self.edu = ht, race, edu
        self.comments = comments

    def __repr__(self):
        """Return ``SpeakerInfo(name=value, ...)`` listing every field in order."""
        attribs = 'id sex dr use recdate birthdate ht race edu comments'
        pairs = []
        for field in attribs.split():
            pairs.append('%s=%r' % (field, getattr(self, field)))
        return 'SpeakerInfo(%s)' % (', '.join(pairs))
def read_timit_block(stream):
    """
    Block reader for timit tagged sentences, which are preceded by a sentence
    number that will be ignored.

    One line is consumed per call.

    :param stream: An object with a ``readline()`` method returning text.
    :return: A one-element list holding the sentence text (everything
        after the first space, trailing newline included), or an empty
        list at end of stream or when the line is blank.
    :raise ValueError: If a non-blank line contains no space separating
        the sentence number from the sentence.
    """
    line = stream.readline()
    # ``readline`` returns '' at end of stream.  A whitespace-only line
    # (e.g. a trailing blank line) carries no sentence either; the
    # original code crashed on it while unpacking the split.
    if not line.strip():
        return []
    _sent_number, sent = line.split(' ', 1)
    return [sent]

View File

@@ -0,0 +1,83 @@
# Natural Language Toolkit: Toolbox Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Greg Aumann <greg_aumann@sil.org>
# Stuart Robinson <Stuart.Robinson@mpi.nl>
# Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Module for reading, writing and manipulating
Toolbox databases and settings fileids.
"""
from nltk.toolbox import ToolboxData
from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
class ToolboxCorpusReader(CorpusReader):
    """
    Corpus reader that exposes files through ``ToolboxData``: as parsed
    XML, as ``(marker, contents)`` fields, as grouped entries, as words,
    or as raw text.
    """

    def xml(self, fileids, key=None):
        """
        :return: The concatenation of ``ToolboxData(...).parse(key=key)``
            for each of the given files.
        """
        parsed = []
        for path, enc in self.abspaths(fileids, True):
            parsed.append(ToolboxData(path, enc).parse(key=key))
        return concat(parsed)

    def fields(
        self,
        fileids,
        strip=True,
        unwrap=True,
        encoding='utf8',
        errors='strict',
        unicode_fields=None,
    ):
        """
        :return: The concatenated field lists of the given files; the
            keyword options are forwarded to ``ToolboxData.fields``.
        """
        per_file = []
        for path, enc in self.abspaths(fileids, include_encoding=True):
            data = ToolboxData(path, enc)
            per_file.append(
                list(data.fields(strip, unwrap, encoding, errors, unicode_fields))
            )
        return concat(per_file)

    # should probably be done lazily:
    def entries(self, fileids, **kwargs):
        """
        Group fields into entries.  A field whose marker equals ``key``
        (keyword argument, default ``'lx'``) starts a new entry; every
        other field is attached to the most recent entry.  Fields that
        precede the first entry are dropped.

        :return: A list of ``(contents, [(marker, contents), ...])``.
        """
        key = kwargs.pop('key', 'lx')  # 'lx' is the default key in MDF
        grouped = []
        for marker, contents in self.fields(fileids, **kwargs):
            if marker == key:
                grouped.append((contents, []))
            elif grouped:
                grouped[-1][-1].append((marker, contents))
        return grouped

    def words(self, fileids, key='lx'):
        """:return: The contents of every field whose marker equals ``key``."""
        matches = []
        for marker, contents in self.fields(fileids):
            if marker == key:
                matches.append(contents)
        return matches

    def raw(self, fileids):
        """
        :return: The concatenated raw contents of the given files
            (all files of the corpus when ``fileids`` is None).
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            contents.append(self.open(fileid).read())
        return concat(contents)
def demo():
    """Placeholder demonstration hook; intentionally does nothing."""
    return None


if __name__ == '__main__':
    demo()

View File

@@ -0,0 +1,153 @@
# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
"""
import json
import os
from six import string_types
from nltk.tokenize import TweetTokenizer
from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
from nltk.corpus.reader.api import CorpusReader
class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', fileids='.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

       import json
       for tweet in reader.docs():
           print(json.dumps(tweet, indent=1, sort_keys=True))

    """

    # The corpus view class used by this reader.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'
    ):
        """
        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.

        :param encoding: Passed through to ``CorpusReader``.

        :raise ValueError: If any corpus file outside a zip archive is empty.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        # Files inside a zip archive are not checked.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        Tweets without a ``'text'`` key are skipped.

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono['text']
                if isinstance(text, bytes):
                    # NOTE(review): ``self.encoding`` is passed without
                    # being called; if it is a method on the base class
                    # rather than a string attribute, ``decode`` will
                    # raise TypeError here -- confirm.
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets,
            each as a list of words, screen names, hashtags, URLs and
            punctuation symbols.

        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def raw(self, fileids=None):
        """
        Return the corpora in their raw form.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.

        Reads at most 10 lines per call.  Whitespace-only lines are
        skipped instead of being handed to the JSON decoder.

        :return: the Tweets decoded from the lines read; fewer than 10
            (possibly none) when the stream ends or blank lines occur.
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                return tweets
            if not line.strip():
                # A blank line holds no JSON document.
                continue
            tweet = json.loads(line)
            tweets.append(tweet)
        return tweets

Some files were not shown because too many files have changed in this diff Show More