#!/usr/local/bin/python
#
# Author: Preslav I. Nakov
#         UC Berkeley, nakov@cs.berkeley.edu
#
# Description: Extracts features (non-stop words with a pre-specified minimum frequency)
#   from the "20 newsgroups" text collection and writes an ARFF file to be used by WEKA.
#   Allows you to specify:
#     1) which subset of classes to consider
#     2) number of training/testing documents for each class
#     3) minimum feature frequency, so that a candidate is accepted as a feature
#     4) regular expression pattern a token should match in order to be considered as a potential feature
#     5) whether to use a sparse data output
#     6) whether to remove the stopwords
#     7) whether to convert words to lowercase
#
# PLEASE NOTE:
#   1) Any combination of the parameters is good for cross-validation
#      experiments, but one will get incorrect (optimistic) results if the produced
#      ARFF is used in a test/train split by WEKA. The problem is that the feature
#      frequencies are calculated over all documents, which means that we would be
#      using word frequency information from the test set, and that is cheating! You
#      are welcome to change the code accordingly, but this is a little bit
#      tricky. Ask me if you have questions.
#
#   2) It is best to use the sparse data output, as it produces much smaller files:
#      faster to generate and to load.
#
#   3) You need the following NLTK modules installed: "20_newsgroups" and "stopwords".
#
#   4) You need to change the file "__init__.py": in the "20 Newsgroups" section
#      substitute "/" by "\\".
#
#   5) There are now three ways to create training and test set files,
#      illustrated by the three demo functions below.
#
#      The first demo generates a training and a testing file. Use the training file
#      with cross-validation to find the best classification features, algorithm,
#      parameters, etc. Only use the testing file at the very end, when you report
#      (turn in) your results. Anything else would transfer some knowledge from
#      the training to the testing set, which is cheating. Yes, even just checking
#      which algorithm works best on the testing data is cheating, if you use this
#      to determine what your best algorithm (parameters, features, etc.) is. If you
#      want to do something like that, you need to further split the training data
#      into training and validation sets. Then you can do whatever you want with those
#      two sets. When you have finished experimenting and have decided on the best
#      parameters, you can train on your whole training data
#      (i.e. your_training + validation), test on the testing data, and then report
#      your results.
#
#      The second demo lets you set up training and test sets that do not touch
#      the last 200 documents in each group. It allows you to set how many documents
#      go into each training/test set.
#
#      The third demo just produces a training set, with a user-specified number of
#      documents from each group.
#
#   6) Be very careful not to use any information about the testing set when
#      selecting/weighting the features! If you estimate, e.g., IDF, you need to do
#      so using the training set only, and then apply those same values to the testing
#      data. Look how the present code handles the infrequent terms: the frequency and
#      TF filtering are calculated on the training set, and then only the selected
#      terms are used on both training and testing.
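#
# For reference, here is one way the generated ARFF files could be fed to WEKA from the
# command line (a hedged illustration, not part of this script: the classifier choice,
# file names, and fold count are placeholders, and weka.jar is assumed to be on the
# classpath; any WEKA classifier will accept the same -t/-T/-x options):
#
#     # 10-fold cross-validation on the training file only (see notes 1 and 5):
#     java -cp weka.jar weka.classifiers.bayes.NaiveBayes -t homogeneous_train.arff -x 10
#
#     # final run: train on the training file, evaluate once on the testing file:
#     java -cp weka.jar weka.classifiers.bayes.NaiveBayes -t homogeneous_train.arff -T homogeneous_test.arff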
#
# Last modified: October 12, 2004
#

from nltk.corpus import twenty_newsgroups
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import re
import string


############################
###   GLOBAL VARIABLES   ###
############################

stopwordsDict  = {}     # dictionary of stopwords
featurePattern = ''     # what a potential feature should look like
TEST_CNT       = 200    # number of documents left for testing


####################################################################################
### NAME    : build_stoplist()                                                   ###
### PURPOSE : Fetches a list of stopwords for English.                           ###
### RETURNS : Dictionary of stopwords.                                           ###
### NOTE    : Modify this function, if you want to use a different stoplist.     ###
####################################################################################
def build_stoplist():
    stopwordsDict = {}
    for stopword in stopwords.read('english')['WORDS']:
        stopwordsDict[stopword['TEXT']] = 0
    return stopwordsDict


##########################################################################################
### NAME    : is_feature_good()                                                        ###
### PURPOSE : Checks whether a feature looks good in general.                          ###
### RETURNS : 1, if good; 0, otherwise.                                                ###
### NOTE    : Modify this function, if you want to use a different feature filter.     ###
##########################################################################################
def is_feature_good(candidateFeature):
    if featurePattern.match(candidateFeature):                  # make sure the feature matches our pattern
        if stopwordsDict.get(candidateFeature.lower(), 1) > 0:  # skip the stopwords (NOTE: we convert to lowercase on comparison)
            return 1                                            # good: keep it
    return 0                                                    # bad: filter it out


#######################################################################################################
### NAME    : extract_features_and_freqs()                                                          ###
### PURPOSE : Given a list of tokens, creates a dictionary of the corresponding *good* features.    ###
### RETURNS : Dictionary containing the set of features and their frequencies.                      ###
### NOTE    : Modify this function, if you want to use different kinds of features.                 ###
#######################################################################################################
def extract_features_and_freqs(tokens, convertToLowerCase):
    features = {}
    for token in tokens['WORDS']:
        tokenText = token['TEXT']
        if convertToLowerCase:
            tokenText = tokenText.lower()       # convert to lowercase
        if is_feature_good(tokenText):          # check whether the current token text is a good candidate
            features[tokenText] = features.get(tokenText, 0) + 1
    return features


##############################################################################################
### NAME    : extract_features_and_freqs_forall()                                          ###
### PURPOSE : Creates a dictionary of the *good* features for *all* example documents.     ###
### RETURNS : Dictionary containing the set of features and their frequencies.             ###
### NOTE    : Modify this function to calculate IDF.                                       ###
##############################################################################################
def extract_features_and_freqs_forall(classes, convertToLowerCase):
    globalFeatureFreq = {}
    for newsgroup in classes:                                                       # iterate over the classes
        for item in classes[newsgroup]:                                             # iterate over the documents of the current class
            tokens = twenty_newsgroups.read(item)                                   # read the set of tokens for the current document
            featureFreq = extract_features_and_freqs(tokens, convertToLowerCase)    # extract the features and their frequencies
            for feature in featureFreq:
                globalFeatureFreq[feature] = globalFeatureFreq.get(feature, 0) + featureFreq[feature]
    return globalFeatureFreq
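

##############################################################################################
### NAME    : extract_doc_freqs_forall()  --  OPTIONAL SKETCH                              ###
### PURPOSE : Illustrates one way to compute document frequencies (and from them IDF)     ###
###           using the *training* documents only, as suggested in NOTE 6 above and in    ###
###           the NOTE of extract_features_and_freqs_forall(). This helper is NOT called  ###
###           anywhere in the script; it is a hedged sketch, and the function name and    ###
###           the idf formula log(N/df) are illustrative choices, not requirements.       ###
##############################################################################################
import math

def extract_doc_freqs_forall(classes, convertToLowerCase):
    docFreq = {}    # feature -> number of training documents containing it
    docCnt  = 0     # total number of training documents seen
    for newsgroup in classes:
        for item in classes[newsgroup]:
            docCnt = docCnt + 1
            tokens = twenty_newsgroups.read(item)
            featureFreq = extract_features_and_freqs(tokens, convertToLowerCase)
            for feature in featureFreq:                 # count each feature at most once per document
                docFreq[feature] = docFreq.get(feature, 0) + 1
    idf = {}
    for feature in docFreq:
        idf[feature] = math.log(float(docCnt) / docFreq[feature])
    return (docFreq, idf)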


##################################################################################
### NAME    : filter_infrequent_features()                                     ###
### PURPOSE : Filters out the infrequent features.                             ###
### RETURNS : The feature dictionary with the infrequent features removed.     ###
### NOTE    : You probably do not need to modify this function.                ###
##################################################################################
def filter_infrequent_features(featureDict, minFeatureFreq):
    newDict = {}
    for feature in featureDict:
        if featureDict[feature] >= minFeatureFreq:
            newDict[feature] = featureDict[feature]
    return newDict


#######################################################################
### NAME    : write_WEKA_input()                                    ###
### PURPOSE : Writes an ARFF file to be used by WEKA.               ###
### RETURNS : Nothing.                                              ###
### NOTE    : You probably do not need to modify this function.    ###
#######################################################################
def write_WEKA_input(fileName, featureDict, relationName, classes, writeSparseARFF, convertToLowerCase):

    # 0. Sort the features alphabetically
    sortedFeatures = featureDict.keys()
    sortedFeatures.sort()

    # 1. Output the list of the selected features
    outFile = open(fileName, "w")

    # 1.1. Output the very first line: with the WEKA relation name
    outFile.write("@RELATION " + relationName + "\n\n")

    # 1.2. Output the list of features, all of type NUMERIC (you might want e.g. nominal features)
    for feature in sortedFeatures:
        outFile.write("@ATTRIBUTE\t" + feature + "\tNUMERIC\n")

    # 1.3. Output the class variable: it is nominal, i.e. we just enumerate the classes
    outFile.write("@ATTRIBUTE\tclass\t{" + string.join(classes, ', ') + "}\n")      # possible classes

    # 2. Output the example representation
    outFile.write("\n@DATA\n\n")
    for newsgroup in classes:
        for item in classes[newsgroup]:

            # 2.1. Extract the features
            tokens = twenty_newsgroups.read(item)                               # extract the tokens
            freqs  = extract_features_and_freqs(tokens, convertToLowerCase)     # extract the features for the current example

            # 2.2. Output the features for the current example
            if writeSparseARFF:
                outFile.write('{')
                featIndex = 0
                for feature in sortedFeatures:                                  # note that we iterate over *all* features
                    if freqs.get(feature, 0) > 0:                               # but we output only the ones present in the current example
                        outFile.write(str(featIndex) + " " + str(freqs[feature]) + ",")
                    featIndex = featIndex + 1
                outFile.write(str(featIndex) + " " + newsgroup + "}\n")         # output the value for the class variable
            else:
                for feature in sortedFeatures:                                  # note that we iterate over *all* features
                    outFile.write(str(freqs.get(feature, 0)) + ",")             # use .get(): features absent from this document are written as 0
                outFile.write(newsgroup + "\n")

    outFile.close()
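

# For orientation, the file written above looks roughly like this (illustrative values;
# the attribute names and counts below are made up). The dense output lists every attribute
# for every document, while the sparse output lists only the non-zero "index value" pairs,
# with the class appended as the last attribute -- which is why sparse files are so much
# smaller (see NOTE 2 above):
#
#     @RELATION TwentyNewsGroups
#
#     @ATTRIBUTE   brake   NUMERIC
#     @ATTRIBUTE   car     NUMERIC
#     @ATTRIBUTE   goalie  NUMERIC
#     @ATTRIBUTE   class   {rec.autos, rec.sport.hockey}
#
#     @DATA
#
#     2,1,0,rec.autos              <-- dense row
#     {0 2,1 1,3 rec.autos}        <-- the same row in sparse form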


###########################################################################
### NAME    : write_ARFF()                                              ###
### PURPOSE : Demonstrates how to use the functions above together.    ###
### RETURNS : Nothing.                                                  ###
### NOTE    : You probably do not need to modify this function.        ###
###########################################################################
def write_ARFF(minFeatureFreq,              # minimum feature frequency, so that a candidate is accepted as a feature
               removeStopWords,             # 1: remove the stopwords; 0: keep them
               featurePattrn,               # regular expression pattern a potential feature should match; used as a filter
               convertToLowerCase,          # 1: convert to lowercase; 0: keep the original case
               writeSparseARFF,             # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               arffRelationName,            # relation name to be output on the first line of the ARFF file
               clsTraining,                 # training: set of classes and number of documents for each class
               outputFileNameTrain,         # training: output ARFF file name
               clsTesting = [],             # testing: set of classes and number of documents for each class (OPTIONAL)
               outputFileNameTest = []):    # testing: output ARFF file name (OPTIONAL)

    global featurePattern   # global, as the filter functions above may need it, and we do not want to pass it as a parameter
    global stopwordsDict    # global, as the filter functions above may need it, and we do not want to pass it as a parameter

    # 1. Create a regular expression for the good features
    featurePattern = re.compile(featurePattrn)

    # 2. Fetch the stopwords list, if needed (otherwise it is left empty)
    if removeStopWords:
        stopwordsDict = build_stoplist()

    # 3. Training: find the features and their frequencies
    featureDictTrain = extract_features_and_freqs_forall(clsTraining, convertToLowerCase)

    # 4. Training: filter the infrequent features
    featureDictFilteredTrain = filter_infrequent_features(featureDictTrain, minFeatureFreq)

    # 5. Training: write the ARFF file
    write_WEKA_input(outputFileNameTrain, featureDictFilteredTrain, arffRelationName, clsTraining, writeSparseARFF, convertToLowerCase)

    # 6. Testing: write the ARFF file (using the *training* feature set)
    if clsTesting != []:
        write_WEKA_input(outputFileNameTest, featureDictFilteredTrain, arffRelationName, clsTesting, writeSparseARFF, convertToLowerCase)


########################################################################################
### NAME    : get_classes_all()                                                      ###
### PURPOSE : Extracts a list of the files to be used for training and testing.     ###
###           Testing: the last TEST_CNT files from each category;                  ###
###           training: all the rest.                                                ###
### RETURNS : The sets of training and testing documents.                           ###
### NOTE    : You probably do not need to modify this function.                     ###
########################################################################################
def get_classes_all(clsSet):
    clsTraining = {}    # training sets
    clsTesting  = {}    # testing sets
    for cls in clsSet:
        clsDocs    = twenty_newsgroups.items(cls)                   # the set of documents assigned to cls
        clsDocsCnt = len(clsDocs)                                   # the total number of documents assigned to cls
        trainCnt   = clsDocsCnt - TEST_CNT                          # the number of training documents for that class
        clsTraining[cls] = clsDocs[0:trainCnt]                      # get the training part for that class
        clsTesting[cls]  = clsDocs[trainCnt:(trainCnt + TEST_CNT)]  # get the testing part for that class
    return (clsTraining, clsTesting)


######################################################################################################
### NAME    : get_classes_TRAIN_ONLY()                                                             ###
### PURPOSE : Extracts a list of the files to be used for training and testing.                    ###
###           SPLITS the TRAINING files into train-training and train-testing.                     ###
###           The number of documents in train-training and train-testing is user-specified.       ###
###           The train-testing part is optional, as one might want cross-validation only.         ###
### RETURNS : The sets of training and testing documents.                                          ###
###           The first files are for training, the next for testing, and the rest are unused.     ###
### NOTE    : You probably do not need to modify this function.                                    ###
######################################################################################################
def get_classes_TRAIN_ONLY(clsSets):
    clsTraining = {}    # training sets
    clsTesting  = {}    # testing sets
    for cls in clsSets:

        # 1. Get the set of documents assigned to the current class
        clsDocs = twenty_newsgroups.items(cls)

        # 2. Make sure no test documents are used
        usedDocs = clsSets[cls]['TRAIN'] + clsSets[cls].get('TEST', 0)
        if len(clsDocs) - usedDocs < TEST_CNT:
            raise RuntimeError('TOO MANY DOCUMENTS!' + "\n" \
                + 'You cannot use that many documents (' + str(usedDocs) + ') for ' + cls + ",\n" \
                + 'as this would take some of the last ' + str(TEST_CNT) + ', i.e. the TESTING documents!' + "\n" \
                + 'The maximal allowed total for this class is ' + str(len(clsDocs) - TEST_CNT))

        # 3. Get the training part for that class
        clsTraining[cls] = clsDocs[0:clsSets[cls]['TRAIN']]

        # 4. Get the testing part for that class
        if clsSets[cls].get('TEST', 0) > 0:     # make sure a testing number of documents was requested
            clsTesting[cls] = clsDocs[clsSets[cls]['TRAIN']:(clsSets[cls]['TRAIN'] + clsSets[cls]['TEST'])]

    return (clsTraining, clsTesting)


##########################################################################
###                                                                    ###
###   USAGE DEMONSTRATION 1: Train on all but the last 200 examples.   ###
###                          Test on the last 200 examples.            ###
###                                                                    ###
##########################################################################
def demo_train_test():

    # 1. Create a set of class names
    homogeneousSet = ['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

    # 2. Extract a list of the files to be used for training and testing
    (clsTraining, clsTesting) = get_classes_all(homogeneousSet)

    # 3. Write the WEKA input files
    write_ARFF(5,                               # minimum feature frequency, so that a candidate is accepted as a feature
               1,                               # 1: remove the stopwords; 0: keep them
               "^[a-zA-Z]+$",                   # regular expression pattern a potential feature should match; used as a filter
               1,                               # 1: convert to lowercase; 0: keep the original case
               1,                               # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               "TwentyNewsGroups",              # relation name to be output on the first line of the ARFF file
               clsTraining,                     # training: set of classes and number of documents for each class
               "c:\\homogeneous_train.arff",    # training: output ARFF file name
               clsTesting,                      # testing: set of classes and number of documents for each class
               "c:\\homogeneous_test.arff")     # testing: output ARFF file name


##########################################################################################################
###                                                                                                    ###
###   USAGE DEMONSTRATION 2: Uses the TRAINING set only! Does not allow using the last 200 files!      ###
###                          Splits TRAIN into train-train and train-test parts.                       ###
###                          Requires a user-specified number of files for train-train and train-test. ###
###                                                                                                    ###
##########################################################################################################
def demo_TRAINING_ONLY_split_train_test():
    ### NOTE: Uses the TRAINING files only!!!

    # 1. Create a set of class names
    homogSet = {}
    homogSet['rec.autos']          = {'TRAIN': 100, 'TEST': 50}
    homogSet['rec.motorcycles']    = {'TRAIN': 100, 'TEST': 50}
    homogSet['rec.sport.baseball'] = {'TRAIN': 100, 'TEST': 50}
    homogSet['rec.sport.hockey']   = {'TRAIN': 100, 'TEST': 50}

    # 2. Extract a list of the files to be used for training and testing
    (clsTraining, clsTesting) = get_classes_TRAIN_ONLY(homogSet)

    # 3. Write the WEKA input files
    write_ARFF(5,                                       # minimum feature frequency, so that a candidate is accepted as a feature
               1,                                       # 1: remove the stopwords; 0: keep them
               "^[a-zA-Z]+$",                           # regular expression pattern a potential feature should match; used as a filter
               1,                                       # 1: convert to lowercase; 0: keep the original case
               1,                                       # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               "TwentyNewsGroups",                      # relation name to be output on the first line of the ARFF file
               clsTraining,                             # training: set of classes and number of documents for each class
               "c:\\homogeneous_train_train.arff",      # training: output ARFF file name
               clsTesting,                              # testing: set of classes and number of documents for each class
               "c:\\homogeneous_train_test.arff")       # testing: output ARFF file name


##########################################################################################################
###                                                                                                    ###
###   USAGE DEMONSTRATION 3: Uses the TRAINING set only! Does not allow using the last 200 files!      ###
###                          Requires a user-specified number of files.                                ###
###                          Produces one file only, which is good for cross-validation.               ###
###                                                                                                    ###
##########################################################################################################
def demo_TRAINING_ONLY_cross_validation():
    ### NOTE: Uses the TRAINING files only!!!

    # 1. Create a set of class names
    homogSet = {}
    homogSet['rec.autos']          = {'TRAIN': 100}
    homogSet['rec.motorcycles']    = {'TRAIN': 100}
    homogSet['rec.sport.baseball'] = {'TRAIN': 100}
    homogSet['rec.sport.hockey']   = {'TRAIN': 100}

    # 2. Extract a list of the files to be used for training
    clsTraining = get_classes_TRAIN_ONLY(homogSet)[0]

    # 3. Write the WEKA input file
    write_ARFF(5,                                           # minimum feature frequency, so that a candidate is accepted as a feature
               1,                                           # 1: remove the stopwords; 0: keep them
               "^[a-zA-Z]+$",                               # regular expression pattern a potential feature should match; used as a filter
               1,                                           # 1: convert to lowercase; 0: keep the original case
               1,                                           # 1: produce sparse output (attribute=value); 0: produce classic output (full vector)
               "TwentyNewsGroups",                          # relation name to be output on the first line of the ARFF file
               clsTraining,                                 # training: set of classes and number of documents for each class
               "c:\\homog_small_cross_validation.arff")     # training: output ARFF file name
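

# Possible entry point (a hedged suggestion, not part of the original workflow): pick
# whichever of the three demos above matches your experiment and call it here.
if __name__ == "__main__":
    demo_TRAINING_ONLY_cross_validation()   # or demo_train_test() / demo_TRAINING_ONLY_split_train_test()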