# weka.py
# Marti Hearst, Oct 2006
"""weka.py
Simple code to process input text and generate features for text categorization.
Outputs the features in the sparse ARFF format for Weka input."""

import os, re
from nltk_lite import tokenize
from nltk_lite.probability import FreqDist

stopList = {}

def buildStoplist():
    words = ["about", "after", "again", "against", "ago", "all", "also", "am",
             "and", "another", "any", "are", "around", "as", "back", "because",
             "been", "before", "being", "best", "better", "bit", "but", "can",
             "com", "come", "could", "day", "did", "didnt", "don", "down",
             "eight", "even", "ever", "every", "few", "find", "first", "five",
             "font", "for", "four", "from", "get", "girl", "give", "going",
             "good", "got", "had", "has", "have", "he", "her", "here", "him",
             "his", "home", "house", "how", "hundred", "if", "in", "into",
             "is", "it", "just", "last", "let", "like", "lot", "made", "make",
             "many", "mean", "might", "million", "month", "more", "most",
             "much", "my", "name", "need", "never", "new", "next", "nine",
             "no", "not", "now", "off", "on", "one", "only", "open", "other",
             "our", "out", "over", "own", "page", "para", "part", "people",
             "pm", "really", "said", "same", "say", "see", "seem", "seven",
             "she", "should", "show", "since", "site", "six", "size", "so",
             "some", "someone", "something", "still", "such", "sure", "take",
             "ten", "text", "than", "that", "the", "their", "them", "then",
             "there", "these", "they", "thing", "think", "this", "those",
             "thought", "thousand", "three", "through", "time", "to", "today",
             "too", "top", "two", "under", "us", "use", "very", "want", "was",
             "way", "we", "well", "were", "what", "when", "where", "which",
             "while", "who", "why", "will", "with", "without", "would", "you",
             "your", "i", "a", "of", "or", "on", "an", "by", "be", "up", "do",
             "don't", "i'm", "at"]
    for item in words:
        stopList[item] = 1

# Modify this function to do more sophisticated tokenization
def buildTokenizerPattern():
    hyphen = r'(\w+\-\s?\w+)'
    apostrophe = r'(\w+\'\w+)'
    numbers = r'((\$|#)?\d+(\.)?\d+%?)'
    punct = r'([^\w\s]+)'
    wordr = r'(\w+)'
    # not using punct in this regex
    r = "|".join([hyphen, apostrophe, numbers, wordr])
    pattern = re.compile(r)
    return pattern

# A filter to aid in feature selection.
# Modify this to do more sophisticated feature selection.
def featurePassesFilter(feature):
    if feature.isdigit():
        return False
    return True

# Build a newsgroup data structure.  Probably no need to modify this.
def buildNewsgroup(dirPrefix, newsgroupName, tokenizerPattern, maxNumFiles):
    path = os.path.join(dirPrefix, newsgroupName)
    files = os.listdir(path)
    # maxNumFiles = -1 means use all files in directory
    if maxNumFiles > 0:
        files = files[0:maxNumFiles]
    tokensDict = getTokensFiles(files, path, tokenizerPattern)
    # each entry in tokensDict is an fd
    newsgroupDict = {'name': newsgroupName,
                     'files': files,
                     'tokensDict': tokensDict.copy(),
                     'path': path}
    return newsgroupDict
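# For illustration only, a rough sketch of what buildNewsgroup returns;
# the directory and group name below are taken from doTraining further down,
# and buildStoplist() / buildTokenizerPattern() must be called first:
#
#   pattern = buildTokenizerPattern()
#   ng = buildNewsgroup("newsgroups/newsgroups_train", "sci.space", pattern, 5)
#   ng['name']        ->  "sci.space"
#   ng['files']       ->  up to 5 posting filenames from that directory
#   ng['tokensDict']  ->  {filename: FreqDist of its lowercased, stoplisted tokens}
#   ng['path']        ->  "newsgroups/newsgroups_train/sci.space"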
# Create a master list of all the features that are used
# for every posting; we need a consistent numbering for
# the ARFF format.  Put all the features from the training set
# here.  Have to check the features from the test set against those
# in this data structure, since the trained Weka model will not
# work on an ARFF file that contains features it hasn't seen in training.
# Probably no need to modify this.
def buildMasterFD(fdList):
    masterFD = FreqDist()
    for sublist in fdList:
        for fd in sublist:
            for feature in fd.samples():
                if featurePassesFilter(feature):
                    masterFD.inc(feature, fd.count(feature))
    return masterFD

# This function calls getTokensFD on each file in a list of filenames.
# It creates a dictionary, each entry of which is a FreqDist corresponding
# to the contents of the file named by its key.  Probably no need to modify this.
def getTokensFiles(fileNames, path, pattern):
    tokensDict = {}
    for fname in fileNames:
        tokensFD = getTokensFD(os.path.join(path, fname), pattern)
        tokensDict[fname] = tokensFD
    return tokensDict

# Tokenize a newsgroup posting and store the resulting tokens
# as a FreqDist.  We do it this way to avoid having to tokenize
# each file more than once.
#
# This is very bare-bones; may want to add stemming, phrase
# recognition, or other kinds of features.
def getTokensFD(fname, tokenizerPattern):
    fd = FreqDist()
    text = open(fname).read()
    tokens = list(tokenize.regexp(text, tokenizerPattern))
    for token in tokens:
        token = token.lower()
        if not stopList.get(token):
            fd.inc(token)
    return fd

# Starter code to write out an input file for Weka.
# Note that this assumes features are all of type numeric
# (except the class variable newsgroup_class).
# If you want to use a learner with nominal features you will
# need to change this code.
# If you adjust this, remember that the class variable *must*
# come last for most of the learning algorithms.
#
# outputFileName: where to write out the file; will use the local directory by default
# relationName: a name for this datafile, used by Weka for printing out info
# sortedFeatures: feature list; must be identical for training and testing
# newsgroupList: a list of data structures of type newsgroup, defined above.
def write_sparse_arff(outputFileName, relationName, sortedFeatures, newsgroupList):
    # Open the output file
    outfile = open(outputFileName, "w")

    # Write out the first line with the Relation name
    outfile.write("@RELATION " + relationName + "\n\n")

    # Write out the feature names and their types
    for featureName in sortedFeatures:
        outfile.write("@ATTRIBUTE\t\"" + featureName + "\"\tNUMERIC\n")

    # Write out the information for the class variable
    newsgroupNames = [ng['name'] for ng in newsgroupList]
    outfile.write("@ATTRIBUTE\t" + "newsgroup_class" + "\t{" +
                  ','.join(newsgroupNames) + "}\n")

    # Write out the data header
    outfile.write("\n@DATA\n\n")

    # Write out the data, one line per newsgroup posting
    for newsgroup in newsgroupList:
        postingFileNames = newsgroup['files']
        newsgroupName = newsgroup['name']
        newsgroupTokensDict = newsgroup['tokensDict']
        for pfn in postingFileNames:
            # get the features for this newsgroup posting
            fileFD = newsgroupTokensDict[pfn]
            # output the features in fixed order
            outfile.write("{")
            featureIndex = 0    # ARFF file starts numbering features at 0
            for feature in sortedFeatures:
                if fileFD.count(feature) > 0:
                    outfile.write(str(featureIndex) + " " +
                                  str(fileFD.count(feature)) + ", ")
                featureIndex += 1
            outfile.write(str(featureIndex) + " \"" + newsgroupName + "\" }\n")
    outfile.flush()
    outfile.close()
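# For illustration, the sparse ARFF file produced above looks roughly like the
# sketch below (the feature names and counts are made up; with two declared
# features, index 2 is the class attribute):
#
#   @RELATION diverse
#
#   @ATTRIBUTE  "bike"   NUMERIC
#   @ATTRIBUTE  "orbit"  NUMERIC
#   @ATTRIBUTE  newsgroup_class  {rec.motorcycles,sci.space}
#
#   @DATA
#
#   {0 3, 2 "rec.motorcycles" }
#   {1 5, 2 "sci.space" }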
# Build up a representation for each newsgroup (this includes tokenization).
# Place the newsgroup representations in a list.
# Create a master list of all features (needed to create the ARFF file);
# this is based on all features from all news postings in the training set.
# Put the best features in a list.  Here I use the most frequent, with
# the number of features cut off by the featureLimit parameter.
# Modify this, for example, for tf.idf weighting (may want to modify
# the buildMasterFD method as well).
# Write all the data to an ARFF file.
# Return the sortedFeatures list to be used for generating a test data file.
def buildFeatureFilesTraining(basedir, relationName, newsgroupNamesList,
                              numFilesLimit, featureLimit, outfileName):
    buildStoplist()
    tokenizerPattern = buildTokenizerPattern()
    newsgroupList = []
    for newsgroupName in newsgroupNamesList:
        newsgroupList.append(buildNewsgroup(basedir, newsgroupName,
                                            tokenizerPattern, numFilesLimit))
    allFDs = [newsgroup['tokensDict'].values() for newsgroup in newsgroupList]
    masterFD = buildMasterFD(allFDs)
    sortedFeatures = masterFD.sorted_samples()[0:featureLimit]
    outfile = os.path.join(basedir, outfileName)
    write_sparse_arff(outfile, relationName, sortedFeatures, newsgroupList)
    return sortedFeatures

# This is similar to buildFeatureFilesTraining, except the sortedFeatures list
# is given: it must be identical to that used for generating the training ARFF file.
def buildFeatureFilesTesting(basedir, relationName, newsgroupNamesList,
                             numFilesLimit, featureLimit, outfileName,
                             sortedFeatures):
    buildStoplist()
    tokenizerPattern = buildTokenizerPattern()
    newsgroupList = []
    for newsgroupName in newsgroupNamesList:
        newsgroupList.append(buildNewsgroup(basedir, newsgroupName,
                                            tokenizerPattern, numFilesLimit))
    outfile = os.path.join(basedir, outfileName)
    write_sparse_arff(outfile, relationName, sortedFeatures, newsgroupList)

# Test out the code on two newsgroups and 25 postings per group.
def doTraining():
    curdir = os.getcwd()
    basedir = os.path.join(curdir, "newsgroups", "newsgroups_train")
    newsgroupNamesList = ["rec.motorcycles", "sci.space"]
    featureLimit = 100
    outfileName = os.path.join(curdir, "diverse_train.arff")
    relationName = "diverse"
    numFilesPerNewsgroup = 25
    buildFeatureFilesTraining(basedir, relationName, newsgroupNamesList,
                              numFilesPerNewsgroup, featureLimit, outfileName)

# Do both training and testing, retaining the training features
# for the testing file.
def doTrainingAndTesting():
    curdir = os.getcwd()
    baseTrainDir = os.path.join(curdir, "newsgroups", "newsgroups_train")
    newsgroupNamesList = ["rec.motorcycles", "sci.space"]
    featureLimit = 100
    outfileName = os.path.join(curdir, "diverse_train.arff")
    relationName = "diverse"
    numFilesPerNewsgroup = 25
    sortedFeatures = buildFeatureFilesTraining(baseTrainDir, relationName,
                                               newsgroupNamesList,
                                               numFilesPerNewsgroup,
                                               featureLimit, outfileName)
    baseTestDir = os.path.join(curdir, "newsgroups", "newsgroups_test")
    numTestFiles = -1    # use all test files
    outfileName = os.path.join(curdir, "diverse_test.arff")
    buildFeatureFilesTesting(baseTestDir, relationName, newsgroupNamesList,
                             numTestFiles, featureLimit, outfileName,
                             sortedFeatures)
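# A minimal way to run the module directly, assuming the newsgroups/newsgroups_train
# and newsgroups/newsgroups_test directories used above exist under the current
# working directory, each containing one subdirectory of posting files per newsgroup.
if __name__ == "__main__":
    doTrainingAndTesting()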