A Programmer's Guide to Data Mining: Reading Notes 6

Preliminary note: the source code and data used in this book can be found at http://guidetodatamining.com/
The theory in this book is fairly simple, the text contains few errors, and it offers plenty of hands-on practice; if you write out every piece of code yourself, you will learn a great deal. In short: a good book for getting started.
Reposting is welcome; please credit the source, and feel free to point out any mistakes.
Collected notes: https://www.zybuluo.com/hainingwyx/note/559139
Naive Bayes and Text

Training phase:

  1. Combine the documents labeled with the same hypothesis into a single text file.
  2. Count the number of word occurrences n in that file and build the vocabulary.
  3. For each word w_k in the vocabulary, count how many times it occurs in the text; call this n_k.
  4. For each word w_k in the vocabulary (stop words removed), compute
P(w_k | h_i) = (n_k + 1) / (n + |Vocabulary|)
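To make the formula concrete, here is a tiny worked example; the counts are invented purely for illustration:

```python
# Invented counts, for illustration only: suppose the word "engine"
# occurs 20 times in the rec.motorcycles training text (n_k = 20),
# that text contains 5000 words in total (n = 5000), and the shared
# vocabulary holds 10000 distinct words.
n_k = 20
n = 5000
vocab_size = 10000

# Laplace-smoothed conditional probability P(w_k | h_i)
p = (n_k + 1) / float(n + vocab_size)
print(p)  # 0.0014; the +1 keeps the estimate nonzero even when n_k = 0
```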
```python
import os, codecs

class BayesText:

    def __init__(self, trainingdir, stopwordlist):
        """This class implements a naive Bayes approach to text
        classification. trainingdir is the training data. Each
        subdirectory of trainingdir is titled with the name of the
        classification category -- those subdirectories in turn
        contain the text files for that category. The stopwordlist
        is a list of words (one per line) that will be removed before
        any counting takes place."""
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}   # stop-word dictionary
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            print('    ' + category)
            # count the words for the current category and the total
            # number of words in that category
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # eliminate any word in the vocabulary (shared across all
        # categories) that doesn't occur at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion; can't delete now because you
                # can't delete from a dict you are iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                # Laplace-smoothed conditional probability
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        print("DONE TRAINING\n\n")

    # input:  trainingdir -- the training-file directory,
    #         category    -- the category to count
    # return: (counts, total) -- per-word counts for this category and
    #         the total number of words seen in it
    def train(self, trainingdir, category):
        """counts word occurrences for a particular category"""
        currentdir = trainingdir + category
        files = os.listdir(currentdir)
        counts = {}
        total = 0
        for file in files:
            #print(currentdir + '/' + file)
            f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')
            for line in f:
                tokens = line.split()
                for token in tokens:
                    # get rid of punctuation and lowercase token
                    token = token.strip('\'".,?:-')
                    token = token.lower()
                    if token != '' and token not in self.stopwords:
                        self.vocabulary.setdefault(token, 0)
                        self.vocabulary[token] += 1   # counts across all documents
                        counts.setdefault(token, 0)
                        counts[token] += 1            # counts for the current category
                        total += 1                    # total words in this category
            f.close()
        return (counts, total)

# test code
bT = BayesText(trainingDir, stoplistfile)
bT.prob['rec.motorcycles']["god"]
```

Classification phase:

Classify a new document by choosing the hypothesis h_i that maximizes P(h_i) * prod_k P(w_k | h_i), i.e. the prior times the product of the conditional probabilities of the document's words (in the code, probabilities, or later sums of log probabilities, are compared across categories).

If the probabilities get very small, Python cannot represent their product and it underflows to zero; in that case, work with log probabilities instead.
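A minimal sketch of the problem and the fix (the numbers are made up): multiplying a few hundred small conditional probabilities underflows to 0.0, while summing their logs stays comparable across categories, and because log is monotonic the winning category is unchanged.

```python
import math

# a document of 500 tokens, each with a small conditional probability
probs = [0.0001] * 500

product = 1.0
for p in probs:
    product *= p
print(product)   # 0.0 -- underflows, so every category would tie at zero

log_sum = sum(math.log(p) for p in probs)
print(log_sum)   # about -4605.2 -- still usable for comparing categories
```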
Stop words: when stop words are pure noise, removing them reduces the amount of processing and improves performance. In some situations, though, the stop-word list deserves a second look; sex offenders, for example, use words such as "me" and "you" far more often than the general population.
```python
def classify(self, itemVector, numVector):
    """Return the class we think itemVector is in"""
    results = []
    sqrt2pi = math.sqrt(2 * math.pi)
    for (category, prior) in self.prior.items():
        prob = prior
        col = 1
        for attrValue in itemVector:
            if not attrValue in self.conditional[category][col]:
                # we did not find any instances of this attribute value
                # occurring with this category, so prob = 0
                prob = 0
            else:
                prob = prob * self.conditional[category][col][attrValue]
            col += 1
        col = 1
        for x in numVector:
            mean = self.means[category][col]
            ssd = self.ssd[category][col]
            ePart = math.pow(math.e, -(x - mean)**2 / (2 * ssd**2))
            prob = prob * ((1.0 / (sqrt2pi * ssd)) * ePart)
            col += 1
        results.append((prob, category))
    # return the category with the highest probability
    #print(results)
    return(max(results)[1])

# test code (this call uses the text classifier's one-argument
# classify(filename); see the cross-validation version below)
bT.classify(testDir + 'rec.motorcycles/104673')
```
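For the numeric attributes, the loop above evaluates the normal probability density using the category's mean and sample standard deviation. A self-contained version of that one computation, with invented numbers, looks like this:

```python
import math

def gaussian_pdf(x, mean, ssd):
    """Density of x under a normal distribution with the given mean
    and sample standard deviation ssd (the same formula as the loop
    above)."""
    ePart = math.pow(math.e, -(x - mean) ** 2 / (2 * ssd ** 2))
    return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * ePart

# e.g. how dense is a value of 100 for a category whose training data
# had mean 90 and standard deviation 10? (numbers invented)
print(gaussian_pdf(100, 90, 10))  # about 0.0242
```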

10-fold cross-validation:
```python
from __future__ import print_function
import os, codecs, math

class BayesText:

    # input: training-file directory, stop-word list, the bucket to hold out
    def __init__(self, trainingdir, stopwordlist, ignoreBucket):
        """This class implements a naive Bayes approach to text
        classification. trainingdir is the training data. Each
        subdirectory of trainingdir is titled with the name of the
        classification category -- those subdirectories in turn
        contain the text files for that category. The stopwordlist
        is a list of words (one per line) that will be removed before
        any counting takes place."""
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # filter out files that are not directories
        # (in this program: neg and pos)
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            #print('    ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category,
                                                 ignoreBucket)
        # eliminate any word in the vocabulary that doesn't occur
        # at least 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # mark word for deletion; can't delete now because you
                # can't delete from a dict you are iterating over
                toDelete.append(word)
        # now delete
        for word in toDelete:
            del self.vocabulary[word]
        # now compute probabilities
        vocabLength = len(self.vocabulary)
        #print("Computing probabilities:")
        for category in self.categories:
            #print('    ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1)
                                             / denominator)
        #print("DONE TRAINING\n\n")

    def train(self, trainingdir, category, bucketNumberToIgnore):
        """counts word occurrences for a particular category"""
        ignore = "%i" % bucketNumberToIgnore
        currentdir = trainingdir + category
        directories = os.listdir(currentdir)
        counts = {}
        total = 0
        for directory in directories:
            if directory != ignore:
                currentBucket = trainingdir + category + "/" + directory
                files = os.listdir(currentBucket)
                #print("   " + currentBucket)
                for file in files:
                    f = codecs.open(currentBucket + '/' + file, 'r',
                                    'iso8859-1')
                    for line in f:
                        tokens = line.split()
                        for token in tokens:
                            # get rid of punctuation and lowercase token
                            token = token.strip('\'".,?:-')
                            token = token.lower()
                            if token != '' and token not in self.stopwords:
                                self.vocabulary.setdefault(token, 0)
                                self.vocabulary[token] += 1
                                counts.setdefault(token, 0)
                                counts[token] += 1
                                total += 1
                    f.close()
        return (counts, total)

    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                #print(token)
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        results[category] += math.log(
                            self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda tuple: tuple[1], reverse=True)
        # for debugging I can change this to give me the entire list
        return results[0][0]

    # input:  test-file directory for one category, the category,
    #         the held-out bucket number
    # return: classification counts for this category, e.g. {'neg': 12, 'pos': 23}
    def testCategory(self, direc, category, bucketNumber):
        results = {}
        directory = direc + ("%i/" % bucketNumber)
        #print("Testing " + directory)
        files = os.listdir(directory)
        total = 0
        #correct = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            results.setdefault(result, 0)
            results[result] += 1
            #if result == category:
            #    correct += 1
        return results

    # input:  test-file directory, the held-out bucket number
    # return: classification counts for every category,
    #         e.g. {'neg': {'neg': 12, 'pos': 23}, ...}
    def test(self, testdir, bucketNumber):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category"""
        results = {}
        categories = os.listdir(testdir)
        # filter out files that are not directories
        categories = [filename for filename in categories
                      if os.path.isdir(testdir + filename)]
        for category in categories:
            #print(".", end="")
            results[category] = self.testCategory(
                testdir + category + '/', category, bucketNumber)
        return results

def tenfold(dataPrefix, stoplist):
    results = {}
    for i in range(0, 10):
        bT = BayesText(dataPrefix, stoplist, i)
        r = bT.test(theDir, i)
        for (key, value) in r.items():
            results.setdefault(key, {})
            for (ckey, cvalue) in value.items():
                results[key].setdefault(ckey, 0)
                results[key][ckey] += cvalue
    categories = list(results.keys())
    categories.sort()
    print("\n       Classified as: ")
    header = "        "
    subheader = "      +"
    for category in categories:
        header += "% 2s   " % category
        subheader += "-----+"
    print(header)
    print(subheader)
    total = 0.0
    correct = 0.0
    for category in categories:
        row = " %s  |" % category
        for c2 in categories:
            if c2 in results[category]:
                count = results[category][c2]
            else:
                count = 0
            row += " %3i |" % count
            total += count
            if c2 == category:
                correct += count
        print(row)
        print(subheader)
    print("\n%5.3f percent correct" % ((correct * 100) / total))
    print("total of %i instances" % total)

# change these to match your directory structure
prefixPath = "data/review_polarity/"
theDir = prefixPath + "/txt_sentoken/"
stoplistfile = prefixPath + "stopwords25.txt"
tenfold(theDir, stoplistfile)
```
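The tenfold routine prints a confusion matrix and a percent-correct line. If you also want the accuracy as a reusable number, a small helper over the same nested results dict (shape {actual: {predicted: count}}, as accumulated inside tenfold) could look like the sketch below; the function name and the example counts are mine, not the book's.

```python
def accuracy(results):
    """Overall accuracy, in percent, from a nested dict of the shape
    {actual_category: {predicted_category: count}}."""
    correct = 0
    total = 0
    for actual, row in results.items():
        for predicted, count in row.items():
            total += count
            if predicted == actual:
                correct += count
    return 100.0 * correct / total

# invented counts for illustration:
print(accuracy({'neg': {'neg': 400, 'pos': 100},
                'pos': {'neg': 80, 'pos': 420}}))  # 82.0
```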
