###  ANLP ASSIGNMENT 2 CODE
###  Andrew T. Fiore
###  September 2004
from nltk.parser.chunk import *
from nltk.probability import ConditionalFreqDist
####
## Following functions from Marti's sample code and the NLTK tutorial
####
# Grab the treebank sentences that contain the given word, searching only the
# first numdocs documents, and return them in a list.
# This is not very efficient; it would be best to grab all the docs' tokens once
# and reuse them.  Here I free up the memory after using it.
def find_treebank_sents_counts(numdocs, word):
    """Collect treebank sentences whose tokens include `word`.

    Reads the first `numdocs` items of the 'tagged' treebank section and
    scans every token of every sentence.  A sentence is added to the
    result once for each token whose 'TEXT' equals `word`, so a sentence
    containing the word twice appears twice in the returned list.
    """
    from nltk.corpus import treebank
    selected = treebank.items('tagged')[:numdocs]
    loaded = [treebank.read(name) for name in selected]
    matches = []
    for document in loaded:
        for sentence in document['SENTS']:
            hits = [token for token in sentence['WORDS']
                    if token['TEXT'] == word]
            # one entry per matching token, all referring to the same sentence
            matches.extend([sentence] * len(hits))
    del loaded                         # drop the reference to the loaded documents
    return matches
# create a conditional frequency distribution for treebank words
# a special twist: group all similar POS tags together, e.g., all verbs are 'VB'
# user passes in how many docs to process
def find_treebank_counts(numdocs):
    """Build a word -> POS-tag conditional frequency distribution.

    Reads the first `numdocs` items of the 'tagged' treebank section.
    Each token's 'POS' tag is cut down to its first two characters
    before counting, so tags sharing a two-character prefix are counted
    together.  Returns the populated ConditionalFreqDist, conditioned on
    the token text.
    """
    from nltk.corpus import treebank
    counts = ConditionalFreqDist()
    selected = treebank.items('tagged')[:numdocs]
    documents = [treebank.read(name) for name in selected]
    for document in documents:
        for sentence in document['SENTS']:
            for token in sentence['WORDS']:
                # tags of two characters or fewer come through unchanged
                coarse_tag = token['POS'][:2]
                counts[token['TEXT']].inc(coarse_tag)
    return counts
# find the most frequent words with a given POS
def find_common_pos(num, pos, cfd):
    """Return the `num` highest-ranked (count, word) pairs for tag `pos`.

    For every condition (word) in `cfd`, pairs the count of `pos` under
    that word with the word itself.  Pairs are ordered from largest to
    smallest; equal counts are ordered by word, also descending.
    """
    ranked = []
    for word in cfd.conditions():
        ranked.append((cfd[word].count(pos), word))
    ranked.sort()
    ranked.reverse()
    return ranked[:num]
             
# taken from the tutorial
def eval_chunker():
    """Score a one-rule NP chunker on the first ten tagged treebank items.

    Builds a RegexpChunkParser from a single ChunkRule, parses the leaves
    of every sentence tree in the selected items, scores each guessed
    tree against the sentence's own 'TREE', and prints the accumulated
    ChunkScore.  Returns nothing.
    """
    from nltk.corpus import treebank
    # The tag pattern had been split across two lines with its <...> tag
    # element missing, leaving an unterminated string literal.  It is
    # restored here to match the rule's description: one or more
    # consecutive DT, JJ, or NN tags.
    rule = ChunkRule('<DT|JJ|NN>+', "Chunk sequences of DT, JJ, and NN")
    chunkparser = RegexpChunkParser([rule], chunk_node='NP',
                     top_node='S', TAG='POS', SUBTOKENS='WORDS')
    chunkscore = ChunkScore()
    for item in treebank.items('tagged')[:10]:
        for sent in treebank.read(item, add_locs=True)['SENTS']:
            # Re-chunk a fresh token built from the gold tree's leaves.
            test_sent = Token(WORDS=sent['TREE'].leaves())
            chunkparser.parse(test_sent)
            chunkscore.score(sent['TREE'], test_sent['TREE'])
    # Parenthesised single argument prints identically under the
    # Python 2 print statement.
    print(chunkscore)
def chunk_sentences_base(sents):
    """Run the baseline NP, PP and VP chunk parsers over each sentence.

    The three parsers are created once, in NP, PP, VP order, and applied
    to every sentence in that same order.  The values returned by
    parse() are ignored; the input `sents` object is returned.
    """
    parsers = (create_np_parser(), create_pp_parser(), create_vp_parser())
    for sentence in sents:
        for parser in parsers:
            parser.parse(sentence)
    return sents
def chunk_sentences_atf(sents):
    """Run the '_atf' NP, PP and VP chunk parsers over each sentence.

    The three parsers are created once, in NP, PP, VP order, and applied
    to every sentence in that same order.  The values returned by
    parse() are ignored; the input `sents` object is returned.
    """
    parsers = (create_np_parser_atf(),
               create_pp_parser_atf(),
               create_vp_parser_atf())
    for sentence in sents:
        for parser in parsers:
            parser.parse(sentence)
    return sents
### ORIGINAL RULES
def create_np_parser ():
    """Build and return the baseline NP chunk parser.

    The parser has a single ChunkRule (described as 'Chunk NPs'); it reads
    the 'WORDS' subtokens and stores its output under 'NP-CHUNKS'.
    """
    # NOTE(review): apart from <,> this tag pattern contains no <TAG>
    # elements, only grouping and repetition operators.  It looks as if the
    # angle-bracketed tags were stripped from the text at some point --
    # confirm against the original rule before relying on this parser.
    rule = ChunkRule(r'(??)?*(<,>)*()+', 'Chunk NPs')
    parser = RegexpChunkParser([rule],
            chunk_node='NP',   # the name to assign to matched substrings (optional)
            TAG='POS',         # where to find the POS tags
            top_node='S',      # the name to assign the top tree node  (optional)
            SUBTOKENS='WORDS', # which set of tokens to use as input
            TREE='NP-CHUNKS')  # the name for the output tokens of this parse (optional)
    return parser
# Label PPs with NP-CHUNKS (overwriting earlier NP-CHUNKS) so output
# will work with VP-chunks
def create_pp_parser ():
    """Build and return the baseline PP chunk parser.

    The parser has a single ChunkRule (described as 'Chunk PPs') whose
    matches are labelled 'PP'.  It takes 'NP-CHUNKS' as its SUBTOKENS and
    also names 'NP-CHUNKS' as its TREE.
    """
    # NOTE(review): the tag pattern is an empty group followed by '+'; the
    # <TAG> element(s) that belonged inside the parentheses appear to have
    # been stripped from the text -- confirm against the original rule.
    rule = ChunkRule(r'()+', 'Chunk PPs')
    parser = RegexpChunkParser([rule], chunk_node = 'PP', TAG='POS', top_node='S',
                               SUBTOKENS='NP-CHUNKS', TREE='NP-CHUNKS')
    return parser
# VP parser uses the tokens called NP-CHUNKS as input
# It produces as output a new token called VP-CHUNKS
def create_vp_parser ():
    rule = ChunkRule(r'?()+(