# Code for using the NLTK chunking package
# Marti A. Hearst
# September, 2004

from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist

# Build a toy tagged sentence: "the little cat sat on the mat"
def create_cat_list():
    the = Token(TEXT='the', POS='DT')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NN')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    the = Token(TEXT='the', POS='DT')
    mat = Token(TEXT='mat', POS='NN')
    return Token(WORDS=[the, little, cat, sat, on, the, mat])

# Build a second toy tagged sentence:
# "the cute , little cat sat on the mat in the house"
def create_cat_list2():
    the = Token(TEXT='the', POS='DT')
    cute = Token(TEXT='cute', POS='JJ')
    comma = Token(TEXT=',', POS=',')
    little = Token(TEXT='little', POS='JJ')
    cat = Token(TEXT='cat', POS='NNS')
    sat = Token(TEXT='sat', POS='VBD')
    on = Token(TEXT='on', POS='IN')
    mat = Token(TEXT='mat', POS='NNS')
    inn = Token(TEXT='in', POS='IN')
    house = Token(TEXT='house', POS='NNS')
    return Token(WORDS=[the, cute, comma, little, cat, sat, on, the, mat,
                        inn, the, house])

# NP chunker: an optional determiner and adjectives (commas allowed between
# adjectives) followed by one or more nouns.
# (The angle-bracketed tags in the rule were lost; this is a best-guess
# reconstruction of the pattern.)
def create_np_parser():
    rule = ChunkRule(r'(<DT>?<JJ>?)?<JJ>*(<,><JJ>)*(<NN.*>)+', 'Chunk NPs')
    parser = RegexpChunkParser(
        [rule],
        chunk_node='NP',      # the name to assign to matched substrings (optional)
        TAG='POS',            # where to find the POS tags
        top_node='S',         # the name to assign the top tree node (optional)
        SUBTOKENS='WORDS',    # which set of tokens to use as input
        TREE='NP-CHUNKS')     # the name for the output tokens of this parse (optional)
    return parser

# Label PPs with NP-CHUNKS (overwriting earlier NP-CHUNKS) so output
# will work with VP-chunks.
# PP chunker: a preposition followed by an NP chunk.
# (Tag pattern reconstructed; see note above.)
def create_pp_parser():
    rule = ChunkRule(r'(<IN><NP>)+', 'Chunk PPs')
    parser = RegexpChunkParser(
        [rule],
        chunk_node='PP',
        TAG='POS',
        top_node='S',
        SUBTOKENS='NP-CHUNKS',
        TREE='NP-CHUNKS')
    return parser

# VP parser uses the tokens called NP-CHUNKS as input.
# It produces as output a new token called VP-CHUNKS.
# VP chunker: an optional modal, one or more verbs, an optional NP, then one
# or more NP/PP chunks.  (Tag pattern reconstructed; see note above.)
def create_vp_parser():
    rule = ChunkRule(r'<MD>?(<VB.*>)+(<NP>)?(<NP>|<PP>)+', 'Chunk VPs')
    parser = RegexpChunkParser(
        [rule],
        chunk_node='VP',
        TAG='POS',
        top_node='S',
        SUBTOKENS='NP-CHUNKS',
        TREE='VP-CHUNKS')
    return parser

# Run the NP, PP, and VP chunkers in sequence over each sentence token
def chunk_sentences(sents):
    npp = create_np_parser()
    ppp = create_pp_parser()
    vpp = create_vp_parser()
    for sent in sents:
        npp.parse(sent)
        ppp.parse(sent)
        vpp.parse(sent)
    return sents

# Grab the treebank sentences that contain the given word, from the first
# numdocs documents.  Return them in a list.
# This is not very efficient; it would be best to grab all the docs' tokens
# once and reuse them.  Here I free up the memory after using it.
def find_treebank_sents_counts(numdocs, word):
    from nltk.corpus import treebank
    sents = []
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                if tok['TEXT'] == word:
                    sents.append(sent)
    del docs  # free up the memory
    return sents

# Create a conditional frequency distribution for treebank words.
# A special twist: group all similar POS tags together, e.g., all verbs
# become 'VB'.  The user passes in how many docs to process.
def find_treebank_counts(numdocs):
    from nltk.corpus import treebank
    cfd = ConditionalFreqDist()
    docs = [treebank.read(item) for item in treebank.items('tagged')[0:numdocs]]
    for doc in docs:
        for sent in doc['SENTS']:
            for tok in sent['WORDS']:
                tag = tok['POS']
                if len(tag) > 2:
                    tag = tag[0:2]  # group all Nouns, Verbs, etc.
                cfd[tok['TEXT']].inc(tag)
    return cfd

# Find the num most frequent words with a given POS
def find_common_pos(num, pos, cfd):
    result = [(cfd[word].count(pos), word) for word in cfd.conditions()]
    result.sort()
    result.reverse()
    return result[0:num]

# Taken from the tutorial: score a simple NP chunker against the treebank
def eval_chunker():
    from nltk.corpus import treebank
    rule = ChunkRule('<DT|JJ|NN>+', "Chunk sequences of DT, JJ, and NN")
    chunkparser = RegexpChunkParser([rule], chunk_node='NP', top_node='S',
                                    TAG='POS', SUBTOKENS='WORDS')
    chunkscore = ChunkScore()
    for item in treebank.items('tagged')[:10]:
        for sent in treebank.read(item, add_locs=True)['SENTS']:
            test_sent = Token(WORDS=sent['TREE'].leaves())
            chunkparser.parse(test_sent)
            chunkscore.score(sent['TREE'], test_sent['TREE'])
    print chunkscore
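
# ---------------------------------------------------------------------------
# Usage sketch: run the cascaded NP/PP/VP chunkers over the two toy sentences
# and print the resulting trees, then evaluate the tutorial chunker.
# This driver is only an illustrative sketch; it assumes the 2004-era
# NLTK 1.x API imported above, that each parser stores its output tree under
# its TREE property (so 'VP-CHUNKS' holds the final parse), and that the
# tagged treebank corpus is installed for eval_chunker().
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    sents = [create_cat_list(), create_cat_list2()]
    chunk_sentences(sents)
    for sent in sents:
        # the final VP pass wrote its output under the 'VP-CHUNKS' property
        print sent['VP-CHUNKS']

    # score the simple DT/JJ/NN chunker against the first ten treebank files
    eval_chunker()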