# Code for using the NLTK chunking package
# Marti A. Hearst
# September, 2004

from nltk.parser.chunk import *
from nltk.probability import *


def create_cat_list ():
    the = Token(TEXT='the', POS='DT')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NN')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    the = Token(TEXT='the', POS='DT')
    mat = Token(TEXT='mat', POS='NN')
    return Token(WORDS=[the, little, cat, sat, on, the, mat])

def create_cat_list2 ():
    the = Token(TEXT='the', POS='DT')
    cute = Token(TEXT='cute', POS='JJ')
    comma = Token(TEXT=',', POS=',')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NNS')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    mat = Token(TEXT='mat', POS='NNS')
    inn = Token(TEXT='in', POS='IN')
    house = Token(TEXT='house', POS='NNS')
    return Token(WORDS=[the, cute, comma, little, cat, sat, on, the, mat, inn, the, house])

def create_np_parser ():
    #rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    #rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    quotationMarksRule = ChunkRule(r'(??)?<``>*(<,>)*()+<\'\'>',
                                   'Chunk quoted NPs')
    rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    posRule1 = ChunkRule(r'<POS>', 'Chunk possessives')
    posRule2 = MergeRule(r'<.*>', r'<POS>', 'Include possessives with preceding NPs')
    posRule3 = MergeRule(r'<POS>', r'<.*>', 'Include possessives with following NPs')
    # Found a better way to do quotation marks
    #rule2 = ChunkRule(r'?<``>|<\'\'>', 'Chunk quotation marks')
    #rule3 = MergeRule(r'?<``>', r'<.*>', 'Include opening quotation marks in NPs')
    #rule4 = MergeRule(r'<.*>', r'<\'\'>', 'Include ending quotation marks in NPs')
    #rule5 = UnChunkRule(r'?<``>|<\'\'>', 'Unchunk isolated quotation marks')
    # (BAD) rule = ChunkRule(r'(??)?(<,>?)*()+', 'Chunk NPs')
    #parser = RegexpChunkParser([rule, posRule1, posRule2, posRule3, rule2, rule3, rule4, rule5],
    parser = RegexpChunkParser([quotationMarksRule, rule, posRule1, posRule2, posRule3],
                               chunk_node='NP',    # the name to assign to matched substrings (optional)
                               TAG='POS',          # where to find the POS tags
                               top_node='S',       # the name to assign the top tree node (optional)
                               SUBTOKENS='WORDS',  # which set of tokens to use as input
                               TREE='NP-CHUNKS')   # the name for the output tokens of this parse (optional)
    return parser

# Label PPs with NP-CHUNKS (overwriting earlier NP-CHUNKS) so the output
# will work with VP-chunks
def create_pp_parser ():
    rule = ChunkRule(r'(<IN><NP>)+', 'Chunk PPs')
    parser = RegexpChunkParser([rule],
                               chunk_node='PP',
                               TAG='POS',
                               top_node='S',
                               SUBTOKENS='NP-CHUNKS',
                               TREE='NP-CHUNKS')
    return parser

def createSecondPassNPParser():
    rule = ChunkRule(r'<NP><PP>', 'Merge <NP><PP> into <NP>')
    parser = RegexpChunkParser([rule],
                               chunk_node='NP',
                               TAG='POS',
                               top_node='S',
                               SUBTOKENS='NP-CHUNKS',
                               TREE='NP-CHUNKS')
    return parser

# VP parser uses the tokens called NP-CHUNKS as input
# It produces as output a new token called VP-CHUNKS
def create_vp_parser ():
    #rule = ChunkRule(r'?()+()?(|)+', 'Chunk VPs')
    #rule = ChunkRule(r'?()+()?()*(||)+', 'Chunk VPs')
    rule = ChunkRule(r'?()?()+()?()?(||)+', 'Chunk VPs')
    parser = RegexpChunkParser([rule],
                               chunk_node='VP',
                               TAG='POS',
                               top_node='S',
                               SUBTOKENS='NP-CHUNKS',
                               TREE='VP-CHUNKS')
    return parser

def chunk_sentences(sents):
    npp = create_np_parser()
    ppp = create_pp_parser()
    npp2 = createSecondPassNPParser()
    vpp = create_vp_parser()
    for sent in sents:
        npp.parse(sent)
        ppp.parse(sent)
        npp2.parse(sent)
        vpp.parse(sent)
    return sents
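# A usage sketch of the pipeline above (assumes the NLTK 1.x token API used
# throughout this file): run all four chunking passes over the toy sentence
# and inspect the chunk trees, which the parsers store in the sentence
# token's 'NP-CHUNKS' and 'VP-CHUNKS' properties.
# >>> sent = create_cat_list()
# >>> chunk_sentences([sent])
# >>> print sent['NP-CHUNKS']
# >>> print sent['VP-CHUNKS']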
# Grab the treebank sentences that contain the given word, from numdocs documents
# Return them in a list
# This is not very efficient; would be best to grab all the docs' tokens once
# and reuse.  Here I free up the memory after using it.
def find_treebank_sents_counts(numdocs, word):
    from nltk.corpus import treebank
    sents = []
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                if tok['TEXT'] == word:
                    sents.append(sent)
    del docs  # free up the memory
    return sents

# create a conditional frequency distribution for treebank words
# a special twist: group all similar POS tags together, e.g., all verbs are 'VB'
# user passes in how many docs to process
def find_treebank_counts(numdocs):
    from nltk.corpus import treebank
    cfd = ConditionalFreqDist()
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                tag = tok['POS']
                if len(tag) > 2:
                    tag = tag[0:2]  # group all Nouns, Verbs, etc.
                cfd[tok['TEXT']].inc(tag)
    return cfd

# find the most frequent words with a given POS
def find_common_pos(num, pos, cfd):
    result = [(cfd[word].count(pos), word) for word in cfd.conditions()]
    result.sort()
    result.reverse()
    return result[0:num]

# taken from the tutorial
def eval_chunker():
    from nltk.corpus import treebank
    rule = ChunkRule('<DT|JJ|NN>+', "Chunk sequences of DT, JJ, and NN")
    chunkparser = RegexpChunkParser([rule], chunk_node='NP', top_node='S',
                                    TAG='POS', SUBTOKENS='WORDS')
    chunkscore = ChunkScore()
    for item in treebank.items('tagged')[:10]:
        for sent in treebank.read(item, add_locs=True)['SENTS']:
            test_sent = Token(WORDS=sent['TREE'].leaves())
            chunkparser.parse(test_sent)
            chunkscore.score(sent['TREE'], test_sent['TREE'])
    print chunkscore

def loadSentences():
    # First we want to get all the sentences containing our chosen verb.
    # Let's use the verb say, which has the conjugated forms: said, saying, says
    sentences = find_treebank_sents_counts(99, 'say')
    sentences.extend(find_treebank_sents_counts(99, 'said'))
    sentences.extend(find_treebank_sents_counts(99, 'saying'))
    sentences.extend(find_treebank_sents_counts(99, 'says'))
    print 'Using %d sentences from the treebank.' % len(sentences)
    return sentences
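# A sketch of how the frequency helpers above could be used to pick a target
# verb (the document count and the 'VB' tag here are only illustrative):
# find_treebank_counts truncates tags to two characters, so all verb forms
# are grouped under 'VB', and find_common_pos then ranks the words.
# >>> cfd = find_treebank_counts(10)
# >>> find_common_pos(5, 'VB', cfd)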
# To copy sentences make sure to do something like the following:
# >>> import copy
# >>> oldSentences = copy.deepcopy(sentences)

def getIndicesOfChangedSentences(oldSentences, newSentences):
    indices = []
    for index in range(0, len(oldSentences)):
        #print index
        if (oldSentences[index]['VP-CHUNKS'] != newSentences[index]['VP-CHUNKS']):
            indices.append(index)
    return indices

def assignment2():
    originalSentences = loadSentences()
    # Now let's chunk the sentences
    # We can examine the chunked sentences using:
    #   print sentences[1]['VP-CHUNKS']
    sentences = chunk_sentences(originalSentences)
    # Let's collect statistics about the words that precede and follow our verb
    previousWordFreqDist = FreqDist()
    previousTagFreqDist = FreqDist()
    nextWordFreqDist = FreqDist()
    nextTagFreqDist = FreqDist()
    nextPhraseFreqDist = FreqDist()
    for sentence in sentences:
        tokens = sentence['VP-CHUNKS'].leaves()
        length = len(tokens)
        index = getIndexOfSay(tokens)
        if (index - 1) < 0:
            previousWordFreqDist.inc('BEGIN')
            previousTagFreqDist.inc('BEGIN')
        else:
            previousWordFreqDist.inc(tokens[index-1]['TEXT'])
            previousTagFreqDist.inc(tokens[index-1]['POS'])
        if (index + 1) >= length:
            nextWordFreqDist.inc('END')
            nextTagFreqDist.inc('END')
        else:
            nextWordFreqDist.inc(tokens[index+1]['TEXT'])
            nextTagFreqDist.inc(tokens[index+1]['POS'])
        nextPhraseFreqDist.inc(getProceedingPhraseType(sentence['VP-CHUNKS']))
    print 'Previous words:'
    printSortedSamples(previousWordFreqDist)
    print 'Previous tags:'
    printSortedSamples(previousTagFreqDist)
    print 'Next words:'
    printSortedSamples(nextWordFreqDist)
    print 'Next tags:'
    printSortedSamples(nextTagFreqDist)
    print 'Next phrase:'
    printSortedSamples(nextPhraseFreqDist)

def printSortedSamples(fd):
    sortedSamples = fd.sorted_samples()
    length = min(9, len(sortedSamples))
    for index in range(0, length):
        print '%16s %d' % (sortedSamples[index], fd.count(sortedSamples[index]))

def getIndexOfSay(tokens):
    say = 'say'
    said = 'said'
    saying = 'saying'
    says = 'says'
    for index in range(0, len(tokens)):
        text = tokens[index]['TEXT']
        if ((text == say) or (text == said) or (text == saying) or (text == says)):
            return index
    return -1

def getProceedingPhraseType(sentence):
    tok = Token(TEXT='', POS='')
    tree = Tree('', [])
    say = 'say'
    said = 'said'
    saying = 'saying'
    says = 'says'
    length = len(sentence)
    for index in range(0, length):
        child = sentence[index]
        if (type(child) == type(tok)):
            text = child['TEXT']
            if ((text == say) or (text == said) or (text == saying) or (text == says)):
                if (index + 1) <= (length - 1):
                    sibling = sentence[index + 1]
                    if (type(sibling) == type(tok)):
                        return sibling['POS']
                    else:
                        return sibling.node
                else:
                    return 'END_OF_VERB_PHRASE'
        if (type(child) == type(tree)):
            returnedValue = getProceedingPhraseType(child)
            if returnedValue != -1:
                return returnedValue
    return -1
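# A sketch of the rule-comparison workflow suggested by the deepcopy note
# above: chunk a deep copy with the current rules, revise a chunking rule,
# re-chunk the original list, and inspect the sentences whose VP-CHUNKS
# changed.
# >>> import copy
# >>> sentences = loadSentences()
# >>> oldSentences = copy.deepcopy(sentences)
# >>> chunk_sentences(oldSentences)     # chunk with the current rules
# >>> # ... edit a rule in create_np_parser() or create_vp_parser(), reload ...
# >>> chunk_sentences(sentences)        # chunk with the revised rules
# >>> for index in getIndicesOfChangedSentences(oldSentences, sentences):
# ...     print sentences[index]['VP-CHUNKS']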
CD, NP" # Before: # # <1953/CD> # # <1955/CD> # <,/,> # (NP: <9.8/CD> ) # After: # # <1953/CD> # # (NP: # <1955/CD> # <,/,> # <9.8/CD> # # # ) # Rule: Second NP chunking pass that turns into # Before: # (NP: ) # (PP: (NP: )) # After: # (NP: # (NP: ) # (PP: (NP: ))) # Rule: Include surrounding quotation marks in NP chunks # Before: # # <``/``> # (NP: ) # <''/''> # After: # (NP: # # <``/``> # # # # <''/''>) # Rule: Allow possessives to join two NP chunks # Before: # (NP: ) # <'s/POS> # (NP: ) # After: # (NP: <'s/POS> ) # Rule: Allow adjectives after a verb # Before: # (NP: ) # # # # (PP: (NP: )) # After: # (NP: ) # (VP: # # # # (PP: (NP: ))) # Rule: Allow verb phrases with verbs in their infinitive form # Before: # # (NP: ) # # # # # After: # # (NP: ) # (VP: ) # Rule: Allow possessive pronouns to start noun phrases # Before: # # (NP: ) # After: # (NP: )