#!/usr/local/bin/python
#
# Author: Preslav I. Nakov
#         UC Berkeley, nakov@cs.berkeley.edu
#
# Description: Extracts features (non-stop words with a pre-specified minimum frequency)
#   from the "20 newsgroups" text collection and writes an ARFF file to be used by WEKA.
#   Allows you to specify:
#     1) which subset of classes to consider
#     2) number of training/testing documents for each class
#     3) minimum feature frequency, so that a candidate is accepted as a feature
#     4) regular expression pattern a token should match in order to be considered as a potential feature
#     5) whether to use a sparse data output
#     6) whether to remove the stopwords
#     7) whether to convert words to lowercase
#
# PLEASE NOTE:
#   1) Any combination of the parameters is good for cross-validation
#      experiments, but one will get incorrect (optimistic) results if the produced
#      ARFF is used in a test/train split by WEKA. The problem is that the feature
#      frequencies are calculated over all documents, which means that we would be
#      using word frequency information from the test set, and that is cheating! You
#      are welcome to change the code accordingly, but this is a little bit
#      tricky. Ask me if you have questions.
#
#   2) It is best to use the sparse data output, as it produces much smaller files:
#      faster to generate and to load.
#
#   3) You need the following NLTK modules installed: "20_newsgroups" and "stopwords".
#
#   4) You need to change the file "__init__.py": in the "20 Newsgroups" section
#      substitute "/" by "\\".
#
#   5) There are now three ways to create training and test set files,
#      illustrated by the three demo functions below.
#
#      The first demo generates a training and a testing file. Use the training file
#      with cross-validation to find the best classification features, algorithm,
#      parameters, etc. Only use the testing file at the very end, when you report
#      (turn in) your results. Anything else would transfer some knowledge from
#      the training to the testing set, which is cheating. Yes, even just checking
#      which algorithm works best on the testing data is cheating, if you use this
#      to determine what your best algorithm (parameters, features, etc.) is. If you
#      want to do something like that, you need to further split the training data
#      into training and validation sets. Then you can do whatever you want with those
#      two sets. When you have finished experimenting and have decided on the best
#      parameters, you can train on your whole training data
#      (i.e. your_training + validation), test on the testing data, and then report
#      your results.
#
#      The second demo lets you set up training and test sets that do not touch
#      the last 200 documents in each group. It allows you to set how many documents
#      go into each training/test set.
#
#      The third demo just produces a training set, with a user-specified number of
#      documents from each group.
#
#   6) Be very careful not to use any information about the testing set when
#      selecting/weighting the features! If you estimate, e.g., IDF, you need to do
#      so using the training set only, and then apply those same values to the testing
#      data. Look how the present code handles the infrequent terms: the frequency and
#      TF filtering are calculated on the training set, and then only the selected
#      terms are used on both training and testing.
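#
# For reference, here is one way the generated ARFF files could be fed to WEKA from the
# command line (a hedged illustration, not part of this script: the classifier choice,
# file names, and fold count are placeholders, and weka.jar is assumed to be on the
# classpath; any WEKA classifier will accept the same -t/-T/-x options):
#
#     # 10-fold cross-validation on the training file only (see notes 1 and 5):
#     java -cp weka.jar weka.classifiers.bayes.NaiveBayes -t homogeneous_train.arff -x 10
#
#     # final run: train on the training file, evaluate once on the testing file:
#     java -cp weka.jar weka.classifiers.bayes.NaiveBayes -t homogeneous_train.arff -T homogeneous_test.arff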
#
# Last modified: October 12, 2004
#

from nltk.corpus import twenty_newsgroups
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import re
import string


############################
###   GLOBAL VARIABLES   ###
############################

stopwordsDict  = {}     # dictionary of stopwords
featurePattern = ''     # what a potential feature should look like
TEST_CNT       = 200    # number of documents left for testing


####################################################################################
### NAME    : build_stoplist()                                                   ###
### PURPOSE : Fetches a list of stopwords for English.                           ###
### RETURNS : Dictionary of stopwords.                                           ###
### NOTE    : Modify this function, if you want to use a different stoplist.     ###
####################################################################################
def build_stoplist():
    stopwordsDict = {}
    for stopword in stopwords.read('english')['WORDS']:
        stopwordsDict[stopword['TEXT']] = 0
    return stopwordsDict


##########################################################################################
### NAME    : is_feature_good()                                                        ###
### PURPOSE : Checks whether a feature looks good in general.                          ###
### RETURNS : 1, if good; 0, otherwise.                                                ###
### NOTE    : Modify this function, if you want to use a different feature filter.     ###
##########################################################################################
def is_feature_good(candidateFeature):
    if featurePattern.match(candidateFeature):                  # make sure the feature matches our pattern
        if stopwordsDict.get(candidateFeature.lower(), 1) > 0:  # skip the stopwords (NOTE: we convert to lowercase on comparison)
            return 1                                            # good: keep it
    return 0                                                    # bad: filter it out


#######################################################################################################
### NAME    : extract_features_and_freqs()                                                          ###
### PURPOSE : Given a list of tokens, creates a dictionary of the corresponding *good* features.    ###
### RETURNS : Dictionary containing the set of features and their frequencies.                      ###
### NOTE    : Modify this function, if you want to use different kinds of features.                 ###
#######################################################################################################
def extract_features_and_freqs(tokens, convertToLowerCase):
    features = {}
    for token in tokens['WORDS']:
        tokenText = token['TEXT']
        if convertToLowerCase:
            tokenText = tokenText.lower()       # convert to lowercase
        if is_feature_good(tokenText):          # check whether the current token text is a good candidate
            features[tokenText] = features.get(tokenText, 0) + 1
    return features


##############################################################################################
### NAME    : extract_features_and_freqs_forall()                                          ###
### PURPOSE : Creates a dictionary of the *good* features for *all* example documents.     ###
### RETURNS : Dictionary containing the set of features and their frequencies.             ###
### NOTE    : Modify this function to calculate IDF.                                       ###
##############################################################################################
def extract_features_and_freqs_forall(classes, convertToLowerCase):
    globalFeatureFreq = {}
    for newsgroup in classes:                                                       # iterate over the classes
        for item in classes[newsgroup]:                                             # iterate over the documents of the current class
            tokens = twenty_newsgroups.read(item)                                   # read the set of tokens for the current document
            featureFreq = extract_features_and_freqs(tokens, convertToLowerCase)    # extract the features and their frequencies
            for feature in featureFreq:
                globalFeatureFreq[feature] = globalFeatureFreq.get(feature, 0) + featureFreq[feature]
    return globalFeatureFreq
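

##############################################################################################
### NAME    : extract_doc_freqs_forall()  --  OPTIONAL SKETCH                              ###
### PURPOSE : Illustrates one way to compute document frequencies (and from them IDF)     ###
###           using the *training* documents only, as suggested in NOTE 6 above and in    ###
###           the NOTE of extract_features_and_freqs_forall(). This helper is NOT called  ###
###           anywhere in the script; it is a hedged sketch, and the function name and    ###
###           the idf formula log(N/df) are illustrative choices, not requirements.       ###
##############################################################################################
import math

def extract_doc_freqs_forall(classes, convertToLowerCase):
    docFreq = {}    # feature -> number of training documents containing it
    docCnt  = 0     # total number of training documents seen
    for newsgroup in classes:
        for item in classes[newsgroup]:
            docCnt = docCnt + 1
            tokens = twenty_newsgroups.read(item)
            featureFreq = extract_features_and_freqs(tokens, convertToLowerCase)
            for feature in featureFreq:                 # count each feature at most once per document
                docFreq[feature] = docFreq.get(feature, 0) + 1
    idf = {}
    for feature in docFreq:
        idf[feature] = math.log(float(docCnt) / docFreq[feature])
    return (docFreq, idf)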


##################################################################################
### NAME    : filter_infrequent_features()                                     ###
### PURPOSE : Filters out the infrequent features.                             ###
### RETURNS : The feature dictionary with the infrequent features removed.     ###
### NOTE    : You probably do not need to modify this function.                ###
##################################################################################
def filter_infrequent_features(featureDict, minFeatureFreq):
    newDict = {}
    for feature in featureDict:
        if featureDict[feature] >= minFeatureFreq:
            newDict[feature] = featureDict[feature]
    return newDict


#######################################################################
### NAME    : write_WEKA_input()                                    ###
### PURPOSE : Writes an ARFF file to be used by WEKA.               ###
### RETURNS : Nothing.                                              ###
### NOTE    : You probably do not need to modify this function.    ###
#######################################################################
def write_WEKA_input(fileName, featureDict, relationName, classes, writeSparseARFF, convertToLowerCase):

    # 0. Sort the features alphabetically
    sortedFeatures = featureDict.keys()
    sortedFeatures.sort()

    # 1. Output the list of the selected features
    outFile = open(fileName, "w")

    # 1.1. Output the very first line: with the WEKA relation name
    outFile.write("@RELATION " + relationName + "\n\n")

    # 1.2. Output the list of features, all of type NUMERIC (you might want e.g. nominal features)
    for feature in sortedFeatures:
        outFile.write("@ATTRIBUTE\t" + feature + "\tNUMERIC\n")

    # 1.3. Output the class variable: it is nominal, i.e. we just enumerate the classes
    outFile.write("@ATTRIBUTE\tclass\t{" + string.join(classes, ', ') + "}\n")      # possible classes

    # 2. Output the example representation
    outFile.write("\n@DATA\n\n")
    for newsgroup in classes:
        for item in classes[newsgroup]:

            # 2.1. Extract the features
            tokens = twenty_newsgroups.read(item)                               # extract the tokens
            freqs  = extract_features_and_freqs(tokens, convertToLowerCase)     # extract the features for the current example

            # 2.2. Output the features for the current example
            if writeSparseARFF:
                outFile.write('{')
                featIndex = 0
                for feature in sortedFeatures:                                  # note that we iterate over *all* features
                    if freqs.get(feature, 0) > 0:                               # but we output only the ones present in the current example
                        outFile.write(str(featIndex) + " " + str(freqs[feature]) + ",")
                    featIndex = featIndex + 1
                outFile.write(str(featIndex) + " " + newsgroup + "}\n")         # output the value for the class variable
            else:
                for feature in sortedFeatures:                                  # note that we iterate over *all* features
                    outFile.write(str(freqs.get(feature, 0)) + ",")             # use .get(): features absent from this document are written as 0
                outFile.write(newsgroup + "\n")

    outFile.close()
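

# For orientation, the file written above looks roughly like this (illustrative values;
# the attribute names and counts below are made up). The dense output lists every attribute
# for every document, while the sparse output lists only the non-zero "index value" pairs,
# with the class appended as the last attribute -- which is why sparse files are so much
# smaller (see NOTE 2 above):
#
#     @RELATION TwentyNewsGroups
#
#     @ATTRIBUTE   brake   NUMERIC
#     @ATTRIBUTE   car     NUMERIC
#     @ATTRIBUTE   goalie  NUMERIC
#     @ATTRIBUTE   class   {rec.autos, rec.sport.hockey}
#
#     @DATA
#
#     2,1,0,rec.autos              <-- dense row
#     {0 2,1 1,3 rec.autos}        <-- the same row in sparse form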


###########################################################################
### NAME    : write_ARFF()                                              ###
### PURPOSE : Demonstrates how to use the functions above together.    ###
### RETURNS : Nothing.                                                  ###
### NOTE    : You probably do not need to modify this function.        ###
###########################################################################
def write_ARFF(minFeatureFreq,              # minimum feature frequency, so that a candidate is accepted as a feature
               removeStopWords,             # 1: remove the stopwords; 0: keep them
               featurePattrn,               # regular expression pattern a potential feature should match; used as a filter
               convertToLowerCase,          # 1: convert to lowercase; 0: keep the original case
               writeSparseARFF,             # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               arffRelationName,            # relation name to be output on the first line of the ARFF file
               clsTraining,                 # training: set of classes and number of documents for each class
               outputFileNameTrain,         # training: output ARFF file name
               clsTesting = [],             # testing: set of classes and number of documents for each class (OPTIONAL)
               outputFileNameTest = []):    # testing: output ARFF file name (OPTIONAL)

    global featurePattern   # global, as the filter functions above may need it, and we do not want to pass it as a parameter
    global stopwordsDict    # global, as the filter functions above may need it, and we do not want to pass it as a parameter

    # 1. Create a regular expression for the good features
    featurePattern = re.compile(featurePattrn)

    # 2. Fetch the stopwords list, if needed (otherwise it is left empty)
    if removeStopWords:
        stopwordsDict = build_stoplist()

    # 3. Training: find the features and their frequencies
    featureDictTrain = extract_features_and_freqs_forall(clsTraining, convertToLowerCase)

    # 4. Training: filter the infrequent features
    featureDictFilteredTrain = filter_infrequent_features(featureDictTrain, minFeatureFreq)

    # 5. Training: write the ARFF file
    write_WEKA_input(outputFileNameTrain, featureDictFilteredTrain, arffRelationName, clsTraining, writeSparseARFF, convertToLowerCase)

    # 6. Testing: write the ARFF file (using the *training* feature set)
    if clsTesting != []:
        write_WEKA_input(outputFileNameTest, featureDictFilteredTrain, arffRelationName, clsTesting, writeSparseARFF, convertToLowerCase)


########################################################################################
### NAME    : get_classes_all()                                                      ###
### PURPOSE : Extracts a list of the files to be used for training and testing.     ###
###           Testing: the last TEST_CNT files from each category;                  ###
###           training: all the rest.                                                ###
### RETURNS : The sets of training and testing documents.                           ###
### NOTE    : You probably do not need to modify this function.                     ###
########################################################################################
def get_classes_all(clsSet):
    clsTraining = {}    # training sets
    clsTesting  = {}    # testing sets
    for cls in clsSet:
        clsDocs    = twenty_newsgroups.items(cls)                   # the set of documents assigned to cls
        clsDocsCnt = len(clsDocs)                                   # the total number of documents assigned to cls
        trainCnt   = clsDocsCnt - TEST_CNT                          # the number of training documents for that class
        clsTraining[cls] = clsDocs[0:trainCnt]                      # get the training part for that class
        clsTesting[cls]  = clsDocs[trainCnt:(trainCnt + TEST_CNT)]  # get the testing part for that class
    return (clsTraining, clsTesting)


######################################################################################################
### NAME    : get_classes_TRAIN_ONLY()                                                             ###
### PURPOSE : Extracts a list of the files to be used for training and testing.                    ###
###           SPLITS the TRAINING files into train-training and train-testing.                     ###
###           The number of documents in train-training and train-testing is user-specified.       ###
###           The train-testing part is optional, as one might want cross-validation only.         ###
### RETURNS : The sets of training and testing documents.                                          ###
###           The first files are for training, the next for testing, and the rest are unused.     ###
### NOTE    : You probably do not need to modify this function.                                    ###
######################################################################################################
def get_classes_TRAIN_ONLY(clsSets):
    clsTraining = {}    # training sets
    clsTesting  = {}    # testing sets
    for cls in clsSets:

        # 1. Get the set of documents assigned to the current class
        clsDocs = twenty_newsgroups.items(cls)

        # 2. Make sure no test documents are used
        usedDocs = clsSets[cls]['TRAIN'] + clsSets[cls].get('TEST', 0)
        if len(clsDocs) - usedDocs < TEST_CNT:
            raise RuntimeError('TOO MANY DOCUMENTS!' + "\n" \
                + 'You cannot use that many documents (' + str(usedDocs) + ') for ' + cls + ",\n" \
                + 'as this would take some of the last ' + str(TEST_CNT) + ', i.e. the TESTING documents!' + "\n" \
                + 'The maximal allowed total for this class is ' + str(len(clsDocs) - TEST_CNT))

        # 3. Get the training part for that class
        clsTraining[cls] = clsDocs[0:clsSets[cls]['TRAIN']]

        # 4. Get the testing part for that class
        if clsSets[cls].get('TEST', 0) > 0:     # make sure a testing number of documents was requested
            clsTesting[cls] = clsDocs[clsSets[cls]['TRAIN']:(clsSets[cls]['TRAIN'] + clsSets[cls]['TEST'])]

    return (clsTraining, clsTesting)


##########################################################################
###                                                                    ###
###   USAGE DEMONSTRATION 1: Train on all but the last 200 examples.   ###
###                          Test on the last 200 examples.            ###
###                                                                    ###
##########################################################################
def demo_train_test():

    # 1. Create a set of class names
    homogeneousSet = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

    # 2. Extract a list of the files to be used for training and testing
    (clsTraining, clsTesting) = get_classes_all(homogeneousSet)

    # 3. Write the WEKA input files
    write_ARFF(5,                               # minimum feature frequency, so that a candidate is accepted as a feature
               1,                               # 1: remove the stopwords; 0: keep them
               "^[a-zA-Z]+$",                   # regular expression pattern a potential feature should match; used as a filter
               1,                               # 1: convert to lowercase; 0: keep the original case
               1,                               # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               "TwentyNewsGroups",              # relation name to be output on the first line of the ARFF file
               clsTraining,                     # training: set of classes and number of documents for each class
               "c:\\homogeneous_train.arff",    # training: output ARFF file name
               clsTesting,                      # testing: set of classes and number of documents for each class
               "c:\\homogeneous_test.arff")     # testing: output ARFF file name


##########################################################################################################
###                                                                                                    ###
###   USAGE DEMONSTRATION 2: Uses the TRAINING set only! Does not allow using the last 200 files!      ###
###                          Splits TRAIN into train-train and train-test parts.                       ###
###                          Requires a user-specified number of files for train-train and train-test. ###
###                                                                                                    ###
##########################################################################################################
def demo_TRAINING_ONLY_split_train_test():
    ### NOTE: Uses the TRAINING files only!!!

    # 1. Create a set of class names
    homogSet = {}
    homogSet['rec.autos']          = {'TRAIN': 100, 'TEST': 50}
    homogSet['rec.motorcycles']    = {'TRAIN': 100, 'TEST': 50}
    homogSet['rec.sport.baseball'] = {'TRAIN': 100, 'TEST': 50}
    homogSet['rec.sport.hockey']   = {'TRAIN': 100, 'TEST': 50}

    # 2. Extract a list of the files to be used for training and testing
    (clsTraining, clsTesting) = get_classes_TRAIN_ONLY(homogSet)

    # 3. Write the WEKA input files
    write_ARFF(5,                                       # minimum feature frequency, so that a candidate is accepted as a feature
               1,                                       # 1: remove the stopwords; 0: keep them
               "^[a-zA-Z]+$",                           # regular expression pattern a potential feature should match; used as a filter
               1,                                       # 1: convert to lowercase; 0: keep the original case
               1,                                       # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               "TwentyNewsGroups",                      # relation name to be output on the first line of the ARFF file
               clsTraining,                             # training: set of classes and number of documents for each class
               "c:\\homogeneous_train_train.arff",      # training: output ARFF file name
               clsTesting,                              # testing: set of classes and number of documents for each class
               "c:\\homogeneous_train_test.arff")       # testing: output ARFF file name


##########################################################################################################
###                                                                                                    ###
###   USAGE DEMONSTRATION 3: Uses the TRAINING set only! Does not allow using the last 200 files!      ###
###                          Requires a user-specified number of files.                                ###
###                          Produces one file only, which is good for cross-validation.               ###
###                                                                                                    ###
##########################################################################################################
def demo_TRAINING_ONLY_cross_validation():
    ### NOTE: Uses the TRAINING files only!!!

    # 1. Create a set of class names
    homogSet = {}
    homogSet['rec.autos']          = {'TRAIN': 100}
    homogSet['rec.motorcycles']    = {'TRAIN': 100}
    homogSet['rec.sport.baseball'] = {'TRAIN': 100}
    homogSet['rec.sport.hockey']   = {'TRAIN': 100}

    # 2. Extract a list of the files to be used for training
    clsTraining = get_classes_TRAIN_ONLY(homogSet)[0]

    # 3. Write the WEKA input file
    write_ARFF(5,                                           # minimum feature frequency, so that a candidate is accepted as a feature
               1,                                           # 1: remove the stopwords; 0: keep them
               "^[a-zA-Z]+$",                               # regular expression pattern a potential feature should match; used as a filter
               1,                                           # 1: convert to lowercase; 0: keep the original case
               1,                                           # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               "TwentyNewsGroups",                          # relation name to be output on the first line of the ARFF file
               clsTraining,                                 # training: set of classes and number of documents for each class
               "c:\\homog_small_cross_validation.arff")     # training: output ARFF file name
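

# Possible entry point (a hedged suggestion, not part of the original workflow): pick
# whichever of the three demos above matches your experiment and call it here.
if __name__ == "__main__":
    demo_TRAINING_ONLY_cross_validation()   # or demo_train_test() / demo_TRAINING_ONLY_split_train_test()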