# Code for using the NLTK chunking package
# Marti A. Hearst
# September, 2004
from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist
def create_cat_list():
    """Build the sample sentence 'the little cat sat on the mat'.

    Returns a Token whose WORDS property holds the word tokens in sentence
    order; each word token carries TEXT and POS properties.
    """
    # One 'the' token is created and listed twice, so both positions refer to
    # the same Token object (the same sharing create_cat_list2 relies on).
    the = Token(TEXT='the', POS='DT')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NN')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    mat = Token(TEXT='mat', POS='NN')
    return Token(WORDS=[the, little, cat, sat, on, the, mat])
def create_cat_list2():
    """Build the sample sentence
    'the cute , little cat sat on the mat in the house'.

    Returns a Token whose WORDS property holds the word tokens in sentence
    order; each word token carries TEXT and POS properties.  Every
    occurrence of 'the' refers to the same Token object.
    """
    # (TEXT, POS) of each distinct word, in the order the tokens are created.
    vocabulary = [
        ('the', 'DT'),
        ('cute', 'JJ'),
        (',', ','),
        ('little', 'JJ'),
        ('cat', 'NNS'),
        ('sat', 'VBD'),
        ('on', 'IN'),
        ('mat', 'NNS'),
        ('in', 'IN'),
        ('house', 'NNS'),
    ]
    token_for = {}
    for text, pos in vocabulary:
        token_for[text] = Token(TEXT=text, POS=pos)

    # Sentence order; repeated words reuse the token built above.
    sentence = ['the', 'cute', ',', 'little', 'cat', 'sat', 'on',
                'the', 'mat', 'in', 'the', 'house']
    return Token(WORDS=[token_for[text] for text in sentence])
# Build and return a RegexpChunkParser driven by a single 'Chunk NPs'
# ChunkRule.  Matches are labelled NP under a top node S; POS tags are read
# from each token's POS property, the WORDS subtokens are the input, and the
# result is stored under NP-CHUNKS.
# NOTE(review): the tag pattern below looks damaged -- the raw string is split
# across two lines and, apart from <,>, contains no <TAG> elements (as if
# angle-bracketed tags had been stripped).  It is left untouched here;
# TODO restore the intended pattern from the original source and confirm.
def create_np_parser ():
rule = ChunkRule(r'(
??)?*(<,>)*()+', 'Chunk NPs')
parser = RegexpChunkParser([rule],
chunk_node='NP', # the name to assign to matched substrings (optional)
TAG='POS', # where to find the POS tags
top_node='S', # the name to assign the top tree node (optional)
SUBTOKENS='WORDS', # which set of tokens to use as input
TREE='NP-CHUNKS') # the name for the output tokens of this parse (optional)
return parser
# Label PPs with NP-CHUNKS (overwriting earlier NP-CHUNKS) so output
# will work with VP-chunks
def create_pp_parser():
    """Return a RegexpChunkParser that labels 'Chunk PPs' matches as PP.

    The parser reads POS tags from the POS property, takes NP-CHUNKS as its
    input and writes its result back to NP-CHUNKS, under a top node S.
    """
    # NOTE(review): the tag pattern '()+' contains no <TAG> elements; it may
    # have lost them before reaching this file -- confirm the intended pattern.
    pp_rule = ChunkRule(r'()+', 'Chunk PPs')
    return RegexpChunkParser(
        [pp_rule],
        chunk_node='PP',
        TAG='POS',
        top_node='S',
        SUBTOKENS='NP-CHUNKS',
        TREE='NP-CHUNKS',
    )
# VP parser uses the tokens called NP-CHUNKS as input
# It produces as output a new token called VP-CHUNKS
def create_vp_parser ():
rule = ChunkRule(r'?()+(