### ANLP ASSIGNMENT 2 CODE
### Andrew T. Fiore
### September 2004
from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist
####
## The following functions are adapted from Marti's sample code and the NLTK tutorial
####
# Grab the treebank sentences that contain the given word, searching the
# first numdocs documents. Return them in a list.
# This is not very efficient; it would be best to grab all the docs' tokens
# once and reuse them. Here I free up the memory after using it.
def find_treebank_sents_counts(numdocs, word):
    from nltk.corpus import treebank
    sents = []
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                if tok['TEXT'] == word:
                    sents.append(sent)
                    break  # don't add the same sentence twice
    del docs  # free up the memory
    return sents
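# Example usage (a sketch; it assumes the same 2004-era NLTK corpus API as the
# code above): pull the sentences containing a word and report how many matched.
def demo_find_sents(word='bank', numdocs=5):
    sents = find_treebank_sents_counts(numdocs, word)
    print '%d sentences in the first %d docs contain "%s"' % (len(sents), numdocs, word)
    return sents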
# Create a conditional frequency distribution for treebank words.
# A special twist: group all similar POS tags together, e.g., all verbs
# become 'VB'. The user passes in how many docs to process.
def find_treebank_counts(numdocs):
    from nltk.corpus import treebank
    cfd = ConditionalFreqDist()
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                tag = tok['POS']
                if len(tag) > 2: tag = tag[0:2]  # group all nouns, verbs, etc.
                cfd[tok['TEXT']].inc(tag)
    return cfd
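# Example usage (a sketch): build the distribution over a few documents and
# inspect the collapsed tag counts for one word. FreqDist.samples() is an
# assumption about the old NLTK API; count() is already used above.
def demo_treebank_counts(numdocs=5, word='bank'):
    cfd = find_treebank_counts(numdocs)
    for tag in cfd[word].samples():
        print word, tag, cfd[word].count(tag)
    return cfd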
# find the most frequent words with a given POS
def find_common_pos(num, pos, cfd):
    result = [(cfd[word].count(pos), word) for word in cfd.conditions()]
    result.sort(); result.reverse()
    return result[0:num]
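# Example usage (a sketch): print the num most frequent words with a given
# collapsed POS as (count, word) pairs. Because tags were truncated to two
# characters above, 'NN' covers NN, NNS, NNP, and NNPS.
def demo_common_pos(cfd, num=10, pos='NN'):
    for count, word in find_common_pos(num, pos, cfd):
        print count, word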
# taken from the tutorial
def eval_chunker():
    from nltk.corpus import treebank
    rule = ChunkRule('<DT|JJ|NN.*>+', "Chunk sequences of DT, JJ, and NN")
    chunkparser = RegexpChunkParser([rule], chunk_node='NP',
                                    top_node='S', TAG='POS', SUBTOKENS='WORDS')
    chunkscore = ChunkScore()
    for item in treebank.items('tagged')[:10]:
        for sent in treebank.read(item, add_locs=True)['SENTS']:
            test_sent = Token(WORDS=sent['TREE'].leaves())
            chunkparser.parse(test_sent)
            chunkscore.score(sent['TREE'], test_sent['TREE'])
    print chunkscore
def chunk_sentences_base(sents):
    npp = create_np_parser()
    ppp = create_pp_parser()
    vpp = create_vp_parser()
    for sent in sents:
        npp.parse(sent)
        ppp.parse(sent)
        vpp.parse(sent)
    return sents
def chunk_sentences_atf(sents):
    npp = create_np_parser_atf()
    ppp = create_pp_parser_atf()
    vpp = create_vp_parser_atf()
    for sent in sents:
        npp.parse(sent)
        ppp.parse(sent)
        vpp.parse(sent)
    return sents
### ORIGINAL RULES
def create_np_parser():
    # an NP: optional determiner or possessive, any adjectives (possibly
    # comma-separated), then one or more nouns
    rule = ChunkRule(r'(<DT>?<PRP\$>?)?<JJ.*>*(<,>)*(<NN.*>)+', 'Chunk NPs')
    parser = RegexpChunkParser([rule],
        chunk_node='NP',     # the name to assign to matched substrings (optional)
        TAG='POS',           # where to find the POS tags
        top_node='S',        # the name to assign the top tree node (optional)
        SUBTOKENS='WORDS',   # which set of tokens to use as input
        TREE='NP-CHUNKS')    # the name for the output tokens of this parse (optional)
    return parser
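# Example usage (a sketch): run just the NP chunker over one sentence and
# print the tree it stores under the property named by TREE above.
def demo_np_parser(word='bank'):
    sents = find_treebank_sents_counts(2, word)
    npp = create_np_parser()
    npp.parse(sents[0])
    print sents[0]['NP-CHUNKS']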
# Label the PP parser's output NP-CHUNKS (overwriting the earlier NP-CHUNKS)
# so that its output will feed directly into the VP chunker.
def create_pp_parser():
    # a PP: a preposition followed by a chunked NP
    rule = ChunkRule(r'(<IN><NP>)+', 'Chunk PPs')
    parser = RegexpChunkParser([rule], chunk_node='PP', TAG='POS', top_node='S',
                               SUBTOKENS='NP-CHUNKS', TREE='NP-CHUNKS')
    return parser
# The VP parser uses the tokens called NP-CHUNKS as input.
# It produces as output a new token called VP-CHUNKS.
def create_vp_parser():
    # a VP: optional adverb, one or more verbs, then any chunked NPs or PPs
    rule = ChunkRule(r'<RB>?(<VB.*>)+(<NP|PP>)*', 'Chunk VPs')
    parser = RegexpChunkParser([rule], chunk_node='VP', TAG='POS', top_node='S',
                               SUBTOKENS='NP-CHUNKS', TREE='VP-CHUNKS')
    return parser
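# Example usage (a sketch): run the full NP -> PP -> VP cascade over the
# sentences containing a word and print each resulting tree. The 'VP-CHUNKS'
# property name comes from the TREE argument to the VP parser above.
def demo_chunk_pipeline(word='bank', numdocs=5):
    sents = find_treebank_sents_counts(numdocs, word)
    for sent in chunk_sentences_base(sents):
        print sent['VP-CHUNKS']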