#! /usr/bin/env python

import re
import types

from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist


def find_sentences_using_verb(documents, conjugation):
    """Given a list of tagged treebank documents, return the sentences that
    contain one of the given forms of a verb (a token tagged VB* whose text
    appears in conjugation)."""
    sentences = []
    for document in documents:
        for sentence in document['SENTS']:
            for token in sentence['WORDS']:
                if (token['POS'].startswith('VB')
                    and token['TEXT'] in conjugation):
                    sentences.append(sentence)
    return sentences
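# A minimal sketch, with made-up data, of the token layout the function above
# expects (the real input below comes from the 'tagged' treebank items): each
# document maps 'SENTS' to sentences, each sentence maps 'WORDS' to tokens,
# and each token carries 'TEXT' and 'POS' values.
def _example_find_sentences_using_verb():
    doc = {'SENTS': [{'WORDS': [{'TEXT': 'Prices', 'POS': 'NNS'},
                                {'TEXT': 'rose',   'POS': 'VBD'}]}]}
    # 'rose' is tagged VBD, so the single sentence above is returned.
    return find_sentences_using_verb([doc], ['rise', 'rose'])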
class HackedChunkRule(ChunkRule):
    """Modified to take raw regular expressions exactly as I specify them, so
    that it is possible to use lookbehind assertions (which require the <
    character)."""
    def __init__(self, tag_pattern, descr):
        assert chktype(1, tag_pattern, types.StringType)
        assert chktype(2, descr, types.StringType)
        self._pattern = tag_pattern
        regexp = re.compile('(?P<chunk>%s)%s' %
                            (tag_pattern, ChunkString.IN_CHINK_PATTERN))
        RegexpChunkParserRule.__init__(self, regexp, '{\g<chunk>}', descr)
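# For illustration only (a hypothetical rule, not one of the rules used
# below): HackedChunkRule makes it possible to write a lookbehind such as
#
#     HackedChunkRule(r'(?<=<NP>)<VBG>', 'A gerund immediately after an NP')
#
# whose literal '<' the stock ChunkRule's tag-pattern handling would reject.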
def create_parser_1():
    """This parser operates on unchunked word tokens and produces NP chunks."""
    rules = [
        ChunkRule(r'<\$><-NONE->', 'Share prices'),
        ChunkRule(r'?(|||)*+', 'Chunk nouns and their modifiers'),
        ChunkRule(r'??(|)++', 'Chunk gerunds and their modifiers'),
        ChunkRule(r'<\$>+', 'Chunk currency values'),
        ChunkRule(r'+', 'Chunk numbers (like years)'),
        ChunkRule(r'', 'Chunk personal pronouns')]
    return RegexpChunkParser(rules, chunk_node='NP', TAG='POS',
                             SUBTOKENS='WORDS', TREE='MY-TREE')


def create_parser_2():
    """This parser operates on NP chunks and produces NP chunks."""
    rules = [
        HackedChunkRule(r'(?)', 'Chunk present participles modifying NPs'),
        ChunkRule(r'((<,>||<,>))*', 'Chunk NPs joined by conjunctions'),
        ChunkRule(r'', 'Chunk possessives'),
        ChunkRule(r'', 'Chunk possessive pronouns')]
    return RegexpChunkParser(rules, chunk_node='NP', TAG='POS',
                             SUBTOKENS='MY-TREE', TREE='MY-TREE')


def create_parser_3():
    """This parser operates on NP chunks and produces PP chunks."""
    rules = [
        ChunkRule(r'(|)+', 'Chunk PPs')]
    parser = RegexpChunkParser(rules, chunk_node='PP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser


def create_parser_4():
    """This parser operates on NP and PP chunks and produces NP chunks."""
    rules = [
        ChunkRule(r'', 'Chunk NP-PP combos into NPs')]
    parser = RegexpChunkParser(rules, chunk_node='NP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser


def create_parser_5():
    """This parser operates on NP and PP chunks and produces VP chunks."""
    rules = [
        ChunkRule(r'', 'Chunk verb-adjective combos'),
        ChunkRule(r'(|)??()+<-NONE->???(|)*?', 'Chunk VPs'),
        ChunkRule(r'', 'Chunk lone verbs')]
    parser = RegexpChunkParser(rules, chunk_node='VP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser


def create_parser_6():
    """This parser operates on VP chunks and produces VP chunks."""
    rules = [
        ChunkRule(r'', 'Chunk VPs joined by conjunctions')]
    parser = RegexpChunkParser(rules, chunk_node='VP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser
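# A rough, informal sketch of how the stages above are meant to cascade on
# "shares of NESB rose" (one of the sentences listed in the notes at the
# bottom of this file); the exact bracketing is hypothetical:
#
#     parser_1 chunks the bare nouns:       (NP shares) of (NP NESB) rose
#     parser_3 builds a PP over "of" + NP:  (NP shares) (PP of (NP NESB)) rose
#     parser_4 merges the NP-PP combo:      (NP shares of NESB) rose
#     parser_5 chunks the verb:             (NP shares of NESB) (VP rose)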
def chunk(sentences):
    parser_1 = create_parser_1()
    parser_2 = create_parser_2()
    parser_3 = create_parser_3()
    parser_4 = create_parser_4()
    parser_5 = create_parser_5()
    parser_6 = create_parser_6()
    # Note that parser_2 is applied twice; its second pass sees the chunks
    # created by its first.
    parsers = [parser_1, parser_2, parser_2, parser_3, parser_4, parser_5,
               parser_6]
    for sentence in sentences:
        for parser in parsers:
            parser.parse(sentence, trace=0)


def print_subtrees(tree, pos):
    """Print each outermost subtree of tree whose node label is pos."""
    if tree.__str__().startswith('(' + pos):
        print tree
    elif isinstance(tree, Tree):
        for subtree in tree:
            print_subtrees(subtree, pos)


def show(sentence):
    """Print the sentence's words, skipping empty elements (tokens whose
    text starts with '*')."""
    text = []
    for token in sentence['WORDS']:
        if not token['TEXT'].startswith('*'):
            text.append(token['TEXT'])
    print ' '.join(text)
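# A small sketch, using a made-up tree, of how print_subtrees is used: it
# walks the tree and prints each outermost subtree carrying the requested
# label.
def _example_print_subtrees():
    tree = Tree('S', [Tree('NP', ['sales']), Tree('VP', ['rose'])])
    print_subtrees(tree, 'VP')   # prints only the VP subtree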
#-------------------------------------------------------------------------------

from nltk.corpus import treebank
from original_chunking import chunk_sentences

# Get sentences using forms of "rise."
conjugation = ['rise', 'rises', 'risen', 'rose', 'rising']
docs = [treebank.read(item) for item in treebank.items('tagged')]
sentences = find_sentences_using_verb(docs, conjugation)

# Chunk using original rules.
chunk_sentences(sentences)

# Chunk using my improved rules.
chunk(sentences)

# Show improvements.
for sentence in sentences:
    print '-- Sentence -------------------------------------------------------'
    show(sentence)
    print '-- Original chunking rules ----------------------------------------'
    print sentence['VP-CHUNKS']
    print '-- Improved chunking rules ----------------------------------------'
    print sentence['MY-TREE']
    print '-------------------------------------------------------------------'

## 'rising rates'
## 'rising wages'
## 'rising labor costs'
## 'the yield rose'
## 'sales rose' x3
## 'prices rose' x2
## 'revenue rose'
## 'the index rose'
## 'the dollar rose'
## 'it rose'
## 'stock rose'
## 'Valhi rose'
## 'payouts rose'
## 'shares of NESB rose'
## 'Jaguar's ADRs rose'
## 'backlogs of unfilled orders rose'
## 'construction spending rose'
## 'orders failed to rise'
## 'prices tend to rise'
## 'the Japanese contribution could rise'
## 'the ratings are rising'
## 'orders were up after rising'
## 'shipments fell after rising'
## '[the bank] saw its stock rise'
## 'payouts have sometimes risen'
## 'dividends have risen'
## 'the minimum wage would rise'
##
## rise (percentage) (to X) (from Y)
## adverbs: sharply, smartly, modestly, fractionally