### ANLP ASSIGNMENT 2 CODE
### Andrew T. Fiore
### September 2004

from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist

####
## Following functions from Marti's sample code and the NLTK tutorial
####

# Grab the treebank sentences that contain the given word from the first
# numdocs documents and return them in a list.
# This is not very efficient; it would be best to grab all the docs' tokens
# once and reuse them.  Here I free up the memory after using it.
def find_treebank_sents_counts(numdocs, word):
    from nltk.corpus import treebank
    sents = []
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                if tok['TEXT'] == word:
                    sents.append(sent)
    del docs   # free up the memory
    return sents

# Create a conditional frequency distribution for treebank words.
# A special twist: group all similar POS tags together, e.g., all verbs
# become 'VB'.  The caller passes in how many docs to process.
def find_treebank_counts(numdocs):
    from nltk.corpus import treebank
    cfd = ConditionalFreqDist()
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                tag = tok['POS']
                if len(tag) > 2:
                    tag = tag[0:2]   # group all nouns, verbs, etc.
                cfd[tok['TEXT']].inc(tag)
    return cfd

# Find the most frequent words with a given POS.
def find_common_pos(num, pos, cfd):
    result = [(cfd[word].count(pos), word) for word in cfd.conditions()]
    result.sort()
    result.reverse()
    return result[0:num]
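# Example usage of the frequency helpers above (a sketch of my own, not part of
# the assignment run; the document count, tags, and cut-offs are illustrative).
def demo_common_pos(numdocs=10):
    cfd = find_treebank_counts(numdocs)
    print find_common_pos(5, 'NN', cfd)    # five most frequent noun types
    print find_common_pos(5, 'VB', cfd)    # five most frequent verb types
    print cfd['help'].count('VB')          # how often "help" is tagged as a verb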
# Taken from the tutorial
def eval_chunker():
    from nltk.corpus import treebank
    rule = ChunkRule('<DT|JJ|NN.*>+', "Chunk sequences of DT, JJ, and NN")
    chunkparser = RegexpChunkParser([rule], chunk_node='NP', top_node='S',
                                    TAG='POS', SUBTOKENS='WORDS')
    chunkscore = ChunkScore()
    for item in treebank.items('tagged')[:10]:
        for sent in treebank.read(item, add_locs=True)['SENTS']:
            test_sent = Token(WORDS=sent['TREE'].leaves())
            chunkparser.parse(test_sent)
            chunkscore.score(sent['TREE'], test_sent['TREE'])
    print chunkscore

def chunk_sentences_base(sents):
    npp = create_np_parser()
    ppp = create_pp_parser()
    vpp = create_vp_parser()
    for sent in sents:
        npp.parse(sent)
        ppp.parse(sent)
        vpp.parse(sent)
    return sents

def chunk_sentences_atf(sents):
    npp = create_np_parser_atf()
    ppp = create_pp_parser_atf()
    vpp = create_vp_parser_atf()
    for sent in sents:
        npp.parse(sent)
        ppp.parse(sent)
        vpp.parse(sent)
    return sents

### ORIGINAL RULES

def create_np_parser():
    rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    parser = RegexpChunkParser([rule],
                               chunk_node='NP',     # the name to assign to matched substrings (optional)
                               TAG='POS',           # where to find the POS tags
                               top_node='S',        # the name to assign the top tree node (optional)
                               SUBTOKENS='WORDS',   # which set of tokens to use as input
                               TREE='NP-CHUNKS')    # the name for the output tokens of this parse (optional)
    return parser

# Label PPs with NP-CHUNKS (overwriting the earlier NP-CHUNKS) so the output
# will work with the VP chunker.
def create_pp_parser():
    rule = ChunkRule(r'()+', 'Chunk PPs')
    parser = RegexpChunkParser([rule],
                               chunk_node='PP',
                               TAG='POS',
                               top_node='S',
                               SUBTOKENS='NP-CHUNKS',
                               TREE='NP-CHUNKS')
    return parser

# The VP parser uses the tokens called NP-CHUNKS as input.
# It produces as output a new token called VP-CHUNKS.
def create_vp_parser():
    rule = ChunkRule(r'?()+()?(|)+', 'Chunk VPs')
    parser = RegexpChunkParser([rule],
                               chunk_node='VP',
                               TAG='POS',
                               top_node='S',
                               SUBTOKENS='NP-CHUNKS',
                               TREE='VP-CHUNKS')
    return parser
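# How the baseline parsers are meant to be cascaded (an illustrative comment of
# mine; this is just what chunk_sentences_base() does for a single sentence):
#
#   create_np_parser().parse(sent)   # reads WORDS,     writes NP-CHUNKS
#   create_pp_parser().parse(sent)   # reads NP-CHUNKS, rewrites NP-CHUNKS with PPs folded in
#   create_vp_parser().parse(sent)   # reads NP-CHUNKS, writes VP-CHUNKS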
### ANDREW'S NEW RULES
#
# rule1 = ChunkRule(r'(??)?((<,>)?)*()+(<,>(??)?((<,>)?)*()+<,>)?', 'Chunk NPs')
def create_np_parser_atf():
    rule1 = ChunkRule(r'(??)?((|((<,>)?)))*()+', 'Chunk NPs')
    parser = RegexpChunkParser([rule1],
                               chunk_node='NP',     # the name to assign to matched substrings (optional)
                               TAG='POS',           # where to find the POS tags
                               top_node='S',        # the name to assign the top tree node (optional)
                               SUBTOKENS='WORDS',   # which set of tokens to use as input
                               TREE='NP-CHUNKS')    # the name for the output tokens of this parse (optional)
    return parser

# Label PPs with PP-CHUNKS (built from the earlier NP-CHUNKS) so the output
# will work with the VP chunker.
def create_pp_parser_atf():
    rule = ChunkRule(r'()+(((<,>)?)*(<,>)?)?', 'Chunk PPs')
    parser = RegexpChunkParser([rule],
                               chunk_node='PP',
                               TAG='POS',
                               top_node='S',
                               SUBTOKENS='NP-CHUNKS',
                               TREE='PP-CHUNKS')
    return parser

# The VP parser uses the tokens called PP-CHUNKS as input.
# It produces as output a new token called VP-CHUNKS.
def create_vp_parser_atf():
    rule = ChunkRule(r'()?()+()?()?()*()?(|)*()?()?', 'Chunk VPs')
    parser = RegexpChunkParser([rule],
                               chunk_node='VP',
                               TAG='POS',
                               top_node='S',
                               SUBTOKENS='PP-CHUNKS',
                               TREE='VP-CHUNKS')
    return parser
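# How the new cascade differs from the baseline (an illustrative comment of
# mine): the PP stage now writes a separate PP-CHUNKS analysis instead of
# overwriting NP-CHUNKS, and the VP stage reads PP-CHUNKS.  Per sentence,
# chunk_sentences_atf() runs:
#
#   create_np_parser_atf().parse(sent)   # reads WORDS,     writes NP-CHUNKS
#   create_pp_parser_atf().parse(sent)   # reads NP-CHUNKS, writes PP-CHUNKS
#   create_vp_parser_atf().parse(sent)   # reads PP-CHUNKS, writes VP-CHUNKS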
# Load data from the Treebank containing a form of "help"
numdocs = 100

sents_base = find_treebank_sents_counts(numdocs, 'help')
sents_base = sents_base + find_treebank_sents_counts(numdocs, 'helps')
sents_base = sents_base + find_treebank_sents_counts(numdocs, 'helping')
sents_base = sents_base + find_treebank_sents_counts(numdocs, 'helped')

sents_atf = find_treebank_sents_counts(numdocs, 'help')
sents_atf = sents_atf + find_treebank_sents_counts(numdocs, 'helps')
sents_atf = sents_atf + find_treebank_sents_counts(numdocs, 'helping')
sents_atf = sents_atf + find_treebank_sents_counts(numdocs, 'helped')

len(sents_base)
len(sents_atf)

# Chunk the collected sentences
sents_base = chunk_sentences_base(sents_base)
sents_atf = chunk_sentences_atf(sents_atf)

# Compare my chunking with the baseline regexps provided with the assignment
def sentcomp(i):
    print "Base:"
    print sents_base[i]['VP-CHUNKS']
    print "\nAndrew:"
    print sents_atf[i]['VP-CHUNKS']
    return

# Gather the VPs containing some form of "help"
help_vp = []
for sent in sents_atf:
    for chunk in sent['VP-CHUNKS']:
        if 'node' in dir(chunk):
            if chunk.node == 'VP':
                has_help = 0
                # check whether "help" occurs as a verb in this chunk
                for token in chunk.leaves():
                    if token['TEXT'].lower().startswith('help'):
                        if token['POS'].startswith('VB'):
                            has_help = 1
                if has_help == 1:
                    help_vp.append(chunk)

# Extract all the VPs with "help" as untagged strings
help_str = []
for vp in help_vp:
    my_str = ''
    for token in vp.leaves():
        my_str = my_str + token['TEXT'] + ' '
    help_str.append(my_str)

# Extract all sentences with "help" as untagged strings
help_sen = []
for sen in sents_atf:
    my_str = ''
    for token in sen['VP-CHUNKS'].leaves():
        my_str = my_str + token['TEXT'] + ' '
    help_sen.append(my_str)
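# Quick look at the extracted material (my addition; the slice size is arbitrary)
print len(help_vp), 'VP chunks contain a form of "help" tagged as a verb'
for s in help_str[:5]:
    print s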