# Code for using the NLTK chunking package
# Marti A. Hearst
# September, 2004
from nltk.parser.chunk import *
from nltk.probability import *
def create_cat_list ():
    """Build the example sentence 'the little cat sat on the mat'.

    Returns a Token whose WORDS property is the list of seven word
    tokens, each carrying TEXT and a POS tag.  The single determiner
    token object is reused for both occurrences of 'the' (the original
    code bound `the` twice; the first binding was a dead store that was
    overwritten before the list was built, so behavior is unchanged).
    """
    the = Token(TEXT='the', POS='DT')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NN')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    mat = Token(TEXT='mat', POS='NN')
    return Token(WORDS=[the, little, cat, sat, on, the, mat])
def create_cat_list2 ():
    """Build the example sentence 'the cute , little cat sat on the mat in the house'.

    Returns a Token whose WORDS property lists the twelve word tokens;
    the determiner token object is shared across all three occurrences
    of 'the'.  (POS tags are as in the original demo — e.g. 'cat' is
    tagged NNS — presumably deliberate test data; confirm if reused.)
    """
    # Distinct vocabulary items, one Token object per surface form.
    comma = Token(TEXT=',', POS=',')
    the = Token(TEXT='the', POS='DT')
    cute = Token(TEXT='cute', POS='JJ')
    little = Token(TEXT='little', POS='JJ')
    inn = Token(TEXT='in', POS='IN')
    on = Token(TEXT='on', POS='IN')
    cat = Token(TEXT='cat', POS='NNS')
    mat = Token(TEXT='mat', POS='NNS')
    house = Token(TEXT='house', POS='NNS')
    sat = Token(TEXT='sat', POS='VBD')
    return Token(WORDS=[the, cute, comma, little, cat, sat, on, the, mat, inn, the, house])
def create_np_parser ():
    """Build a RegexpChunkParser that chunks noun phrases (NPs).

    Reads POS tags from the 'POS' property of the 'WORDS' subtokens and
    writes the resulting chunk tree to the 'NP-CHUNKS' property, with
    chunks labeled 'NP' under a top node 'S'.

    NOTE(review): the tag patterns below look corrupted — the angle-
    bracket tag names (e.g. <DT>, <JJ>, <NN.*>) appear to have been
    stripped by a text conversion, leaving empty groups like '()' and
    '(??)'.  The original tag sequences must be recovered before this
    parser can work; the strings are preserved here byte-for-byte.
    """
    # Earlier attempt kept for reference.  (In the mangled source this
    # comment had been split across two lines, leaving a bare syntax
    # error; it is rejoined here.)
    #rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    #rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    # Chunk NPs wrapped in quotation marks, keeping the quotes inside.
    quotationMarksRule = ChunkRule(r'(??)?<``>*(<,>)*()+<\'\'>', \
        'Chunk quoted NPs')
    # Plain (unquoted) NP chunks.
    rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    # Possessive handling: chunk possessives, then merge them with the
    # adjacent NP chunks on either side.
    posRule1 = ChunkRule(r'', 'Chunk possessives')
    posRule2 = MergeRule(r'<.*>', r'', 'Include possessives with preceding NPs')
    posRule3 = MergeRule(r'', r'<.*>', 'Include possessives with proceeding NPs')
    # Found a better way to do quotation marks
    #rule2 = ChunkRule(r'?<``>|<\'\'>', 'Chunk quotation marks')
    #rule3 = MergeRule(r'?<``>', r'<.*>', 'Include opening quotation marks in NPs')
    #rule4 = MergeRule(r'<.*>', r'<\'\'>', 'Include ending quotation marks in NPs')
    #rule5 = UnChunkRule(r'?<``>|<\'\'>', 'Unchunk isolated quotation marks')
    # (BAD) rule = ChunkRule(r'(??)?(<,>?)*()+', 'Chunk NPs')
    #parser = RegexpChunkParser([rule, posRule1, posRule2, posRule3, rule2, rule3, rule4, rule5],
    parser = RegexpChunkParser([quotationMarksRule, rule, posRule1, posRule2, posRule3],
        chunk_node='NP',       # the name to assign to matched substrings (optional)
        TAG='POS',             # where to find the POS tags
        top_node='S',          # the name to assign the top tree node (optional)
        SUBTOKENS='WORDS',     # which set of tokens to use as input
        TREE='NP-CHUNKS')      # the name for the output tokens of this parse (optional)
    return parser
# Label PPs with NP-CHUNKS (overwriting earlier NP-CHUNKS) so output
# will work with VP-chunks
def create_pp_parser ():
    """Return a chunk parser that groups PPs over the NP-chunked tokens.

    Reads the 'NP-CHUNKS' tokens (using their 'POS' tags) and writes its
    result back to 'NP-CHUNKS', so the output feeds straight into the
    later VP-chunking pass.  Chunks are labeled 'PP' under top node 'S'.
    """
    pp_rule = ChunkRule(r'()+', 'Chunk PPs')
    return RegexpChunkParser(
        [pp_rule],
        chunk_node='PP',
        TAG='POS',
        top_node='S',
        SUBTOKENS='NP-CHUNKS',
        TREE='NP-CHUNKS')
def createSecondPassNPParser():
    """Return a second-pass NP chunker over the 'NP-CHUNKS' tokens.

    Applies a single merging rule to the previously chunked output,
    reading from and writing back to 'NP-CHUNKS' ('NP' chunks, top
    node 'S', POS tags from the 'POS' property).
    """
    merge_rule = ChunkRule(r'', 'Merge into ')
    second_pass = RegexpChunkParser(
        [merge_rule],
        chunk_node='NP',
        TAG='POS',
        top_node='S',
        SUBTOKENS='NP-CHUNKS',
        TREE='NP-CHUNKS')
    return second_pass
# VP parser uses the tokens called NP-CHUNKS as input
# It produces as output a new token called VP-CHUNKS
def create_vp_parser ():
#rule = ChunkRule(r'?()+(