# Code for using the NLTK chunking package
# Marti A. Hearst
# September, 2004

from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist

# Build a toy tagged sentence: "the little cat sat on the mat"
def create_cat_list():
    the = Token(TEXT='the', POS='DT')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NN')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    the = Token(TEXT='the', POS='DT')
    mat = Token(TEXT='mat', POS='NN')
    return Token(WORDS=[the, little, cat, sat, on, the, mat])

# Build a second toy tagged sentence:
# "the cute , little cat sat on the mat in the house"
def create_cat_list2():
    the = Token(TEXT='the', POS='DT')
    cute = Token(TEXT='cute', POS='JJ')
    comma = Token(TEXT=',', POS=',')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NNS')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    mat = Token(TEXT='mat', POS='NNS')
    inn = Token(TEXT='in', POS='IN')
    house = Token(TEXT='house', POS='NNS')
    return Token(WORDS=[the, cute, comma, little, cat, sat, on, the, mat,
                        inn, the, house])

# NP chunker: an optional determiner and adjectives (commas allowed between
# adjectives) followed by one or more nouns.
# (The angle-bracketed tags in the rule were lost; this is a best-guess
# reconstruction of the pattern.)
def create_np_parser():
    rule = ChunkRule(r'(<DT>?<JJ>?)?<JJ>*(<,><JJ>)*(<NN.*>)+', 'Chunk NPs')
    parser = RegexpChunkParser(
        [rule],
        chunk_node='NP',      # the name to assign to matched substrings (optional)
        TAG='POS',            # where to find the POS tags
        top_node='S',         # the name to assign the top tree node (optional)
        SUBTOKENS='WORDS',    # which set of tokens to use as input
        TREE='NP-CHUNKS')     # the name for the output tokens of this parse (optional)
    return parser

# Label PPs with NP-CHUNKS (overwriting earlier NP-CHUNKS) so output
# will work with VP-chunks.
# PP chunker: a preposition followed by an NP chunk.
# (Tag pattern reconstructed; see note above.)
def create_pp_parser():
    rule = ChunkRule(r'(<IN><NP>)+', 'Chunk PPs')
    parser = RegexpChunkParser(
        [rule],
        chunk_node='PP',
        TAG='POS',
        top_node='S',
        SUBTOKENS='NP-CHUNKS',
        TREE='NP-CHUNKS')
    return parser

# VP parser uses the tokens called NP-CHUNKS as input.
# It produces as output a new token called VP-CHUNKS.
# VP chunker: an optional modal, one or more verbs, an optional NP, then one
# or more NP/PP chunks.  (Tag pattern reconstructed; see note above.)
def create_vp_parser():
    rule = ChunkRule(r'<MD>?(<VB.*>)+(<NP>)?(<NP>|<PP>)+', 'Chunk VPs')
    parser = RegexpChunkParser(
        [rule],
        chunk_node='VP',
        TAG='POS',
        top_node='S',
        SUBTOKENS='NP-CHUNKS',
        TREE='VP-CHUNKS')
    return parser

# Run the NP, PP, and VP chunkers in sequence over each sentence token
def chunk_sentences(sents):
    npp = create_np_parser()
    ppp = create_pp_parser()
    vpp = create_vp_parser()
    for sent in sents:
        npp.parse(sent)
        ppp.parse(sent)
        vpp.parse(sent)
    return sents

# Grab the treebank sentences that contain the given word, from the first
# numdocs documents.  Return them in a list.
# This is not very efficient; it would be best to grab all the docs' tokens
# once and reuse them.  Here I free up the memory after using it.
def find_treebank_sents_counts(numdocs, word):
    from nltk.corpus import treebank
    sents = []
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                if tok['TEXT'] == word:
                    sents.append(sent)
    del docs  # free up the memory
    return sents

# Create a conditional frequency distribution for treebank words.
# A special twist: group all similar POS tags together, e.g., all verbs
# become 'VB'.  The user passes in how many docs to process.
def find_treebank_counts(numdocs):
    from nltk.corpus import treebank
    cfd = ConditionalFreqDist()
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                tag = tok['POS']
                if len(tag) > 2:
                    tag = tag[0:2]  # group all Nouns, Verbs, etc.
                cfd[tok['TEXT']].inc(tag)
    return cfd

# Find the num most frequent words with a given POS
def find_common_pos(num, pos, cfd):
    result = [(cfd[word].count(pos), word) for word in cfd.conditions()]
    result.sort()
    result.reverse()
    return result[0:num]

# Taken from the tutorial: score a simple NP chunker against the treebank
def eval_chunker():
    from nltk.corpus import treebank
    rule = ChunkRule('<DT|JJ|NN>+', "Chunk sequences of DT, JJ, and NN")
    chunkparser = RegexpChunkParser([rule], chunk_node='NP', top_node='S',
                                    TAG='POS', SUBTOKENS='WORDS')
    chunkscore = ChunkScore()
    for item in treebank.items('tagged')[:10]:
        for sent in treebank.read(item, add_locs=True)['SENTS']:
            test_sent = Token(WORDS=sent['TREE'].leaves())
            chunkparser.parse(test_sent)
            chunkscore.score(sent['TREE'], test_sent['TREE'])
    print chunkscore
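
# ---------------------------------------------------------------------------
# Usage sketch: run the cascaded NP/PP/VP chunkers over the two toy sentences
# and print the resulting trees, then evaluate the tutorial chunker.
# This driver is only an illustrative sketch; it assumes the 2004-era
# NLTK 1.x API imported above, that each parser stores its output tree under
# its TREE property (so 'VP-CHUNKS' holds the final parse), and that the
# tagged treebank corpus is installed for eval_chunker().
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    sents = [create_cat_list(), create_cat_list2()]
    chunk_sentences(sents)
    for sent in sents:
        # the final VP pass wrote its output under the 'VP-CHUNKS' property
        print sent['VP-CHUNKS']

    # score the simple DT/JJ/NN chunker against the first ten treebank files
    eval_chunker()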