#! /usr/bin/env python

import re
import types

from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist


def find_sentences_using_verb(documents, conjugation):
    """Given a list of tagged treebank documents, return the sentences that
    contain one of the given forms of a verb (a token tagged VB* whose text
    appears in conjugation)."""
    sentences = []
    for document in documents:
        for sentence in document['SENTS']:
            for token in sentence['WORDS']:
                if (token['POS'].startswith('VB')
                    and token['TEXT'] in conjugation):
                    sentences.append(sentence)
    return sentences
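# A minimal sketch, with made-up data, of the token layout the function above
# expects (the real input below comes from the 'tagged' treebank items): each
# document maps 'SENTS' to sentences, each sentence maps 'WORDS' to tokens,
# and each token carries 'TEXT' and 'POS' values.
def _example_find_sentences_using_verb():
    doc = {'SENTS': [{'WORDS': [{'TEXT': 'Prices', 'POS': 'NNS'},
                                {'TEXT': 'rose',   'POS': 'VBD'}]}]}
    # 'rose' is tagged VBD, so the single sentence above is returned.
    return find_sentences_using_verb([doc], ['rise', 'rose'])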
class HackedChunkRule(ChunkRule):
    """Modified to take raw regular expressions exactly as I specify them, so
    that it is possible to use lookbehind assertions (which require the <
    character)."""
    def __init__(self, tag_pattern, descr):
        assert chktype(1, tag_pattern, types.StringType)
        assert chktype(2, descr, types.StringType)
        self._pattern = tag_pattern
        regexp = re.compile('(?P<chunk>%s)%s' %
                            (tag_pattern, ChunkString.IN_CHINK_PATTERN))
        RegexpChunkParserRule.__init__(self, regexp, '{\g<chunk>}', descr)
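# For illustration only (a hypothetical rule, not one of the rules used
# below): HackedChunkRule makes it possible to write a lookbehind such as
#
#     HackedChunkRule(r'(?<=<NP>)<VBG>', 'A gerund immediately after an NP')
#
# whose literal '<' the stock ChunkRule's tag-pattern handling would reject.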
def create_parser_1():
    """This parser operates on unchunked word tokens and produces NP chunks."""
    rules = [
        ChunkRule(r'<\$><-NONE->', 'Share prices'),
        ChunkRule(r'?(|||)*+', 'Chunk nouns and their modifiers'),
        ChunkRule(r'??(|)++', 'Chunk gerunds and their modifiers'),
        ChunkRule(r'<\$>+', 'Chunk currency values'),
        ChunkRule(r'+', 'Chunk numbers (like years)'),
        ChunkRule(r'', 'Chunk personal pronouns')]
    return RegexpChunkParser(rules, chunk_node='NP', TAG='POS',
                             SUBTOKENS='WORDS', TREE='MY-TREE')


def create_parser_2():
    """This parser operates on NP chunks and produces NP chunks."""
    rules = [
        HackedChunkRule(r'(?)', 'Chunk present participles modifying NPs'),
        ChunkRule(r'((<,>||<,>))*', 'Chunk NPs joined by conjunctions'),
        ChunkRule(r'', 'Chunk possessives'),
        ChunkRule(r'', 'Chunk possessive pronouns')]
    return RegexpChunkParser(rules, chunk_node='NP', TAG='POS',
                             SUBTOKENS='MY-TREE', TREE='MY-TREE')


def create_parser_3():
    """This parser operates on NP chunks and produces PP chunks."""
    rules = [
        ChunkRule(r'(|)+', 'Chunk PPs')]
    parser = RegexpChunkParser(rules, chunk_node='PP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser


def create_parser_4():
    """This parser operates on NP and PP chunks and produces NP chunks."""
    rules = [
        ChunkRule(r'', 'Chunk NP-PP combos into NPs')]
    parser = RegexpChunkParser(rules, chunk_node='NP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser


def create_parser_5():
    """This parser operates on NP and PP chunks and produces VP chunks."""
    rules = [
        ChunkRule(r'', 'Chunk verb-adjective combos'),
        ChunkRule(r'(|)??()+<-NONE->???(|)*?', 'Chunk VPs'),
        ChunkRule(r'', 'Chunk lone verbs')]
    parser = RegexpChunkParser(rules, chunk_node='VP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser


def create_parser_6():
    """This parser operates on VP chunks and produces VP chunks."""
    rules = [
        ChunkRule(r'', 'Chunk VPs joined by conjunctions')]
    parser = RegexpChunkParser(rules, chunk_node='VP', TAG='POS', top_node='S',
                               SUBTOKENS='MY-TREE', TREE='MY-TREE')
    return parser
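# A rough, informal sketch of how the stages above are meant to cascade on
# "shares of NESB rose" (one of the sentences listed in the notes at the
# bottom of this file); the exact bracketing is hypothetical:
#
#     parser_1 chunks the bare nouns:       (NP shares) of (NP NESB) rose
#     parser_3 builds a PP over "of" + NP:  (NP shares) (PP of (NP NESB)) rose
#     parser_4 merges the NP-PP combo:      (NP shares of NESB) rose
#     parser_5 chunks the verb:             (NP shares of NESB) (VP rose)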
def chunk(sentences):
    parser_1 = create_parser_1()
    parser_2 = create_parser_2()
    parser_3 = create_parser_3()
    parser_4 = create_parser_4()
    parser_5 = create_parser_5()
    parser_6 = create_parser_6()
    # Note that parser_2 is applied twice; its second pass sees the chunks
    # created by its first.
    parsers = [parser_1, parser_2, parser_2, parser_3, parser_4, parser_5,
               parser_6]
    for sentence in sentences:
        for parser in parsers:
            parser.parse(sentence, trace=0)


def print_subtrees(tree, pos):
    """Print each outermost subtree of tree whose node label is pos."""
    if tree.__str__().startswith('(' + pos):
        print tree
    elif isinstance(tree, Tree):
        for subtree in tree:
            print_subtrees(subtree, pos)


def show(sentence):
    """Print the sentence's words, skipping empty elements (tokens whose
    text starts with '*')."""
    text = []
    for token in sentence['WORDS']:
        if not token['TEXT'].startswith('*'):
            text.append(token['TEXT'])
    print ' '.join(text)
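# A small sketch, using a made-up tree, of how print_subtrees is used: it
# walks the tree and prints each outermost subtree carrying the requested
# label.
def _example_print_subtrees():
    tree = Tree('S', [Tree('NP', ['sales']), Tree('VP', ['rose'])])
    print_subtrees(tree, 'VP')   # prints only the VP subtree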
#-------------------------------------------------------------------------------

from nltk.corpus import treebank
from original_chunking import chunk_sentences

# Get sentences using forms of "rise."
conjugation = ['rise', 'rises', 'risen', 'rose', 'rising']
docs = [treebank.read(item) for item in treebank.items('tagged')]
sentences = find_sentences_using_verb(docs, conjugation)

# Chunk using original rules.
chunk_sentences(sentences)

# Chunk using my improved rules.
chunk(sentences)

# Show improvements.
for sentence in sentences:
    print '-- Sentence -------------------------------------------------------'
    show(sentence)
    print '-- Original chunking rules ----------------------------------------'
    print sentence['VP-CHUNKS']
    print '-- Improved chunking rules ----------------------------------------'
    print sentence['MY-TREE']
    print '-------------------------------------------------------------------'

## 'rising rates'
## 'rising wages'
## 'rising labor costs'
## 'the yield rose'
## 'sales rose' x3
## 'prices rose' x2
## 'revenue rose'
## 'the index rose'
## 'the dollar rose'
## 'it rose'
## 'stock rose'
## 'Valhi rose'
## 'payouts rose'
## 'shares of NESB rose'
## 'Jaguar's ADRs rose'
## 'backlogs of unfilled orders rose'
## 'construction spending rose'
## 'orders failed to rise'
## 'prices tend to rise'
## 'the Japanese contribution could rise'
## 'the ratings are rising'
## 'orders were up after rising'
## 'shipments fell after rising'
## '[the bank] saw its stock rise'
## 'payouts have sometimes risen'
## 'dividends have risen'
## 'the minimum wage would rise'
##
## rise (percentage) (to X) (from Y)
## adverbs: sharply, smartly, modestly, fractionally