机器学习-新闻分类案例机器学习

【机器学习-新闻分类案例】
新闻分类案例

项目概述
用TF-IDF和词袋表示文档特征

使用 CountVectorizer 和 TfidfTransformer 计算 TF-IDF
直接使用 TfidfVectorizer

完整过程_词袋模型
训练word2vec模型.ipynb
完整过程_word2vec模型
项目集成

main.py
predicter.py

项目概述

import numpy as np
import pandas as pd

# --- Training data ---------------------------------------------------------
# Tab-separated file without a header row; the two columns are named
# channel (the class label) and article (the text), both read as strings.
train_data = pd.read_csv('data/sohu_train.txt', sep='\t', header=None,
                         dtype=np.str_, encoding='utf8',
                         names=[u'频道', u'文章'])
train_data.head(10)  # first ten rows

# Number of articles per channel: group by channel, count rows per group.
train_data.groupby(u'频道')[u'频道'].count()

# Length (in characters) of every article.
train_data.loc[:, u'文章长度'] = train_data[u'文章'].apply(len)
train_data.head()

# Shortest and longest article per channel; agg() applies several
# aggregation functions at once.
train_data.groupby(u'频道')[u'文章长度'].agg([np.min, np.max])

# --- Test data -------------------------------------------------------------
test_data = pd.read_csv('data/sohu_test.txt', sep='\t', header=None,
                        dtype=np.str_, encoding='utf8',
                        names=[u'频道', u'文章'])
test_data.head()  # head() shows 5 rows when no count is given

# Number of articles per channel.
test_data.groupby(u'频道')[u'频道'].count()

# Shortest and longest article per channel. The lengths must be computed
# from test_data itself; taking them from train_data would align training
# lengths to the test rows by index and report the wrong statistics.
test_data[u'文章长度'] = test_data[u'文章'].apply(len)
test_data.groupby(u'频道')[u'文章长度'].agg([np.min, np.max])

用TF-IDF和词袋表示文档特征

import jieba
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
import pandas as pd

# Toy corpus used by the TF-IDF demonstrations: two short documents.
contents = [
    '小明喜欢看电影，小红也喜欢看电影。',
    '小明还喜欢看足球比赛。',
]

# Punctuation to drop from the token stream. The corpus uses the full-width
# comma, so it is listed next to the ASCII one; with only ',' the comma
# token would never be filtered out.
stopwords = {',', '，', '。'}

使用 CountVectorizer 和 TfidfTransformer 计算 TF-IDF

# Term frequencies (raw occurrence counts, not normalised).
#   tokenizer:  callable that takes a text and returns a list of tokens
#   stop_words: tokens removed from the result; passed as a list, which is
#               the type scikit-learn documents for this parameter
tf = CountVectorizer(tokenizer=jieba.lcut, stop_words=list(stopwords))
res1 = tf.fit_transform(contents)  # two documents -> two count vectors
res1  # a sparse matrix, which stores mostly-zero data compactly

# Convert the sparse result to a dense array to inspect it.
res1.toarray()

# Token -> column index mapping learned from the corpus (the trailing
# underscore marks an attribute set by fitting).
tf.vocabulary_

# Tokens ordered by their column index, used as DataFrame headers.
tf_columns = [word for word, _ in sorted(tf.vocabulary_.items(),
                                         key=lambda item: item[1])]

# TF table: one row per document, one column per token.
pd.DataFrame(res1.toarray(), columns=tf_columns)

# TF -> TF-IDF.
#   use_idf:      multiply the TF matrix by the IDF weights
#   smooth_idf:   add one to the document count and to every document
#                 frequency when computing IDF
#   sublinear_tf: replace tf with 1 + log(tf)
#   norm:         normalise each row; 'l2' scales rows to unit length,
#                 'l1' scales by the sum of absolute values
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True,
                         sublinear_tf=False)
res2 = tfidf.fit_transform(res1)  # res1 is the count matrix from above
res2.toarray()  # the TF-IDF vectors

# Rows are unit vectors (norm='l2'), so this product holds the pairwise
# cosine similarities between the documents.
res2.toarray().dot(res2.toarray().T)

tf.vocabulary_.items()

# IDF value of every token, in column order.
tfidf.idf_

# TF-IDF table: one row per document, one column per token.
pd.DataFrame(res2.toarray(), columns=tf_columns)

直接使用 TfidfVectorizer

# TfidfVectorizer takes the parameters of both CountVectorizer and
# TfidfTransformer and performs the two steps in one go.
tfidf = TfidfVectorizer(tokenizer=jieba.lcut, stop_words=list(stopwords),
                        norm='l2', use_idf=True, smooth_idf=True,
                        sublinear_tf=False)
res = tfidf.fit_transform(contents)  # documents -> TF-IDF features directly
res.toarray()  # the TF-IDF vectors, obtained in a single step

# Token represented by each column.
tfidf.vocabulary_

# IDF of every token, ordered by the column indices in tfidf.vocabulary_.
tfidf.idf_

# Tokens ordered by their column index.
tfidf_columns = [word for word, _ in sorted(tfidf.vocabulary_.items(),
                                            key=lambda item: item[1])]

pd.DataFrame({'词': tfidf_columns, 'IDF': tfidf.idf_}, columns=['词', 'IDF'])

# Column names come from this vectorizer's own vocabulary rather than from
# the separate CountVectorizer (`tf`) fitted in the previous section.
pd.DataFrame(res.toarray(), columns=tfidf_columns)

完整过程_词袋模型

对文档进行自动分类——使用词袋模型

# The bag-of-words model (BoW, a unigram model) ignores word order.
# A bigram model would count adjacent word pairs instead; the general
# family is n-grams.

# Create the output directory that will hold the trained model.
import os

output_dir = u'output'
os.makedirs(output_dir, exist_ok=True)

# 1. Load the data
import numpy as np
import pandas as pd

# Tab-separated training file without a header row: channel (label), article.
train_data = pd.read_csv('data/sohu_train.txt', sep='\t', header=None,
                         dtype=np.str_, encoding='utf8',
                         names=[u'频道', u'文章'])
train_data.head()

# Load the stop words: one per line, lower-cased, blank lines skipped.
# Assumes stopwords.txt is UTF-8 like the corpus files -- TODO confirm;
# without an explicit encoding the platform default would be used.
stopwords = set()
with open('data/stopwords.txt', 'r', encoding='utf8') as infile:
    for line in infile:
        line = line.rstrip('\n')
        if line:
            stopwords.add(line.lower())

# 2. Compute the tf-idf features of every article
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df drops terms that occur in fewer than 50 documents (very specialised
#        or rare words, treated as noise here);
# max_df drops terms that occur in more than 30% of the documents (common
#        words that do not help tell classes apart).
# stop_words is passed as a list, the type scikit-learn documents for it.
tfidf = TfidfVectorizer(tokenizer=jieba.lcut, stop_words=list(stopwords),
                        min_df=50, max_df=0.3)
x = tfidf.fit_transform(train_data[u'文章'])

# The original run reported 24939 terms (a 24000 x 24939 matrix).
print(u'词表大小: {}'.format(len(tfidf.vocabulary_)))

# 3. Train the classifier
# Encode the target: labels are strings, scikit-learn works on integers.
from sklearn.preprocessing import LabelEncoder

y_encoder = LabelEncoder()
y = y_encoder.fit_transform(train_data[u'频道'])  # classes -> 0, 1, 2, ...
y[:10]

# x was already produced by fit_transform above, so no separate
# transform() call is needed here.

# Split into train and test sets.
from sklearn.model_selection import train_test_split

# Stratified by y with 20% held out. The split is done on row indices and
# the feature matrix is then indexed with them.
train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2,
                                       stratify=y)
train_x = x[train_idx, :]
train_y = y[train_idx]
test_x = x[test_idx, :]
test_y = y[test_idx]

# Logistic regression; this is a 12-class problem (12 news channels).
from sklearn.linear_model import LogisticRegression

# Commonly used parameters:
#   penalty:       type of regularisation, l1 or l2
#   C:             inverse of the regularisation strength; larger means
#                  weaker regularisation
#   fit_intercept: whether to fit a constant term
#   max_iter:      maximum number of iterations
#   multi_class:   how the multi-class model is trained
#                    ovr         = one binary model per label
#                    multinomial = a single multi-class model; only with
#                                  solver in {newton-cg, sag, lbfgs}
#   solver:        optimiser, one of {liblinear, newton-cg, sag, lbfgs}
#                    liblinear suits small data, sag is faster on large data
#                    liblinear only supports ovr; the others support ovr
#                    and multinomial
#                    liblinear supports l1; the others listed only l2
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(train_x, train_y)

# 4. Model quality
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Predictions on the held-out test set.
test_y_pred = model.predict(test_x)

# Confusion matrix, labelled with the channel names on both axes.
pd.DataFrame(confusion_matrix(test_y, test_y_pred),
             columns=y_encoder.classes_, index=y_encoder.classes_)


def eval_model(y_true, y_pred, labels):
    """Build a per-class and overall precision/recall/F1 report.

    :param y_true: true encoded labels
    :param y_pred: predicted encoded labels
    :param labels: display name of each class, in encoded-label order
    :return: DataFrame with one row per class plus a final overall row
        (index 999) holding the support-weighted averages and total support
    """
    # Per-class precision, recall, F1 and support.
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

    # Overall figures: averages weighted by class support, and total support.
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)

    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': [u'总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    res2.index = [999]  # keeps the summary row apart from the class rows
    res = pd.concat([res1, res2])
    return res[[u'Label', u'Precision', u'Recall', u'F1', u'Support']]


eval_model(test_y, test_y_pred, y_encoder.classes_)

# 5. Save the model (requires `pip install dill`)
# Everything needed at prediction time goes into one file: the tf-idf
# feature extractor, the label encoder and the classifier.
import dill
import pickle

model_file = os.path.join(output_dir, u'model.pkl')
with open(model_file, 'wb') as outfile:
    dill.dump({
        'y_encoder': y_encoder,
        'tfidf': tfidf,
        'lr': model
    }, outfile)

对新文档预测

# 6. Load new documents
new_data = pd.read_csv('data/sohu_test.txt', sep='\t', header=None,
                       dtype=np.str_, encoding='utf8',
                       names=[u'频道', u'文章'])
new_data.head()

# Load the model bundle written by the training step.
# NOTE: unpickling executes code from the file; only load model files you
# produced yourself.
import pickle

model_file = os.path.join(output_dir, u'model.pkl')
with open(model_file, 'rb') as infile:
    model = pickle.load(infile)

# Predict for new documents (only the first 10 here).
# 1. Convert the articles to their tf-idf bag-of-words representation.
new_x = model['tfidf'].transform(new_data[u'文章'][:10])

# 2. Predict the encoded class.
new_y_pred = model['lr'].predict(new_x)
new_y_pred

# 3. Decode the classes back to channel names and compare with the truth.
pd.DataFrame({u'预测频道': model['y_encoder'].inverse_transform(new_y_pred),
              u'实际频道': new_data[u'频道'][:10]})

训练word2vec模型.ipynb

# Create the output directory that will hold the trained word vectors.
output_dir = 'output_word2vec'

import os

os.makedirs(output_dir, exist_ok=True)

# 1. Load the data
import numpy as np
import pandas as pd

# Tab-separated training file without a header row: channel (label), article.
train_data = pd.read_csv('data/sohu_train.txt', sep='\t', header=None,
                         dtype=np.str_, encoding='utf8',
                         names=['频道', '文章'])
train_data.info()

# Load the stop words: one per line, lower-cased, blank lines skipped.
# Assumes stopwords.txt is UTF-8 like the corpus files -- TODO confirm;
# without an explicit encoding the platform default would be used.
stopwords = set()
with open('data/stopwords.txt', 'r', encoding='utf8') as infile:
    for line in infile:
        line = line.rstrip('\n')
        if line:
            stopwords.add(line.lower())

# 2. Tokenise
import jieba

# One token list per article, with stop words removed.
article_words = []
for article in train_data[u'文章']:
    curr_words = [word for word in jieba.cut(article)
                  if word not in stopwords]
    article_words.append(curr_words)

# Store the tokenised corpus: one article per line, tokens separated by a
# single space, UTF-8 encoded.
seg_word_file = os.path.join(output_dir, 'seg_words.txt')
with open(seg_word_file, 'wb') as outfile:
    for words in article_words:
        outfile.write(u' '.join(words).encode('utf8') + b'\n')
print('分词结果保存到文件:{}'.format(seg_word_file))

# 3. Train the word2vec model (requires `pip install gensim`)
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Sentence iterator over the file: one line is one sentence, tokens are
# separated by spaces. Each whole article is treated as one sentence here.
sentences = LineSentence(seg_word_file)

# Parameters:
#   sentences:      list of sentences, or an iterator over them
#   size:           dimensionality of the word vectors; larger values need
#                   more training data and can yield a better model
#   alpha:          initial learning rate, decayed to min_alpha in training
#   window:         context window size, i.e. the maximum distance between
#                   the current word and the words used to predict it
#   max_vocab_size: vocabulary limit; if exceeded, low-frequency words are
#                   pruned
#   workers:        degree of parallelism
#   iter:           number of training epochs
#   min_count:      ignore words that occur fewer times than this
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.
model = Word2Vec(sentences=sentences, size=100, iter=10, min_count=20)

# Save the model.
model_file = os.path.join(output_dir, 'model.w2v')
model.save(model_file)

word2vec模型的使用

# Load the trained model.
model_file = os.path.join(output_dir, 'model.w2v')
model2 = Word2Vec.load(model_file)


# 1. Find semantically similar words
def invest_similar(*args, **kwargs):
    """Print the words most similar to the query, one `word:score` per line.

    All arguments are forwarded unchanged to the word vectors'
    ``most_similar`` method.
    """
    # Lookups go through model2.wv (the KeyedVectors); the model-level
    # shortcuts are deprecated in gensim 3.x and removed in gensim 4.
    res = model2.wv.most_similar(*args, **kwargs)
    print('\n'.join(['{}:{}'.format(x[0], x[1]) for x in res]))


invest_similar(u'摄影', topn=5)

# woman + mister - man = madam
# mister - madam = man - woman: the direction of that difference vector
# represents gender.
invest_similar(positive=[u'女人', u'先生'], negative=[u'男人'], topn=1)

# 2. Similarity between two words
model2.wv.similarity('摄影', '摄像')

# 3. Look up the vector of a word
model2.wv[u'摄影'].shape
model2.wv[u'摄影']

完整过程_word2vec模型

# Create the output directory.
import os

output_dir = u'output_w2v'
os.makedirs(output_dir, exist_ok=True)

# 1. Load the data
import numpy as np
import pandas as pd

# Tab-separated training file without a header row: channel (label), article.
train_data = pd.read_csv('data/sohu_train.txt', sep='\t', header=None,
                         dtype=np.str_, encoding='utf8',
                         names=[u'频道', u'文章'])
train_data.head()

# Load the stop words: one per line, lower-cased, blank lines skipped.
# (The set is not referenced again in this section.)
# Assumes stopwords.txt is UTF-8 like the corpus files -- TODO confirm.
stopwords = set()
with open('data/stopwords.txt', 'r', encoding='utf8') as infile:
    for line in infile:
        line = line.rstrip('\n')
        if line:
            stopwords.add(line.lower())

# 2. Compute a vector for every article
# Load the trained Word2Vec model; this needs the output of the
# word2vec training notebook.
from gensim.models import Word2Vec

w2v = Word2Vec.load('output_word2vec/model.w2v')

import jieba


def compute_doc_vec_single(article):
    """Return the mean word vector of one article.

    Only tokens present in the word2vec vocabulary contribute. If none of
    the tokens is known, the zero vector is returned instead of dividing
    by zero (which would produce a NaN vector).
    """
    vec = np.zeros((w2v.wv.vector_size,), dtype=np.float32)
    n = 0
    for word in jieba.cut(article):
        if word in w2v.wv:
            vec += w2v.wv[word]  # running sum of the word vectors
            n += 1               # number of words that contributed
    if n == 0:
        return vec
    return vec / n  # mean vector


def compute_doc_vec(articles):
    """Return a 2-D array with one document vector per article."""
    return np.vstack([compute_doc_vec_single(x) for x in articles])


x = compute_doc_vec(train_data[u'文章'])

# 3. Train the classifier
# Encode the target variable.
from sklearn.preprocessing import LabelEncoder

y_encoder = LabelEncoder()
y = y_encoder.fit_transform(train_data[u'频道'])

# Split into train and test sets.
from sklearn.model_selection import train_test_split

# Stratified by y with 20% held out; the split is done on row indices.
train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2,
                                       stratify=y)
train_x = x[train_idx, :]
train_y = y[train_idx]
test_x = x[test_idx, :]
test_y = y[test_idx]

# Logistic regression.
from sklearn.linear_model import LogisticRegression

# Commonly used parameters:
#   penalty:       type of regularisation, l1 or l2
#   C:             inverse of the regularisation strength; larger means
#                  weaker regularisation
#   fit_intercept: whether to fit a constant term
#   max_iter:      maximum number of iterations
#   multi_class:   how the multi-class model is trained
#                    ovr         = one binary model per label
#                    multinomial = a single multi-class model; only with
#                                  solver in {newton-cg, sag, lbfgs}
#   solver:        optimiser, one of {liblinear, newton-cg, sag, lbfgs}
#                    liblinear suits small data, sag is faster on large data
#                    liblinear only supports ovr; the others support ovr
#                    and multinomial
#                    liblinear supports l1; the others listed only l2
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(train_x, train_y)

# 4. Model evaluation
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Predictions on the held-out test set.
test_y_pred = model.predict(test_x)

# Confusion matrix, labelled with the channel names on both axes.
pd.DataFrame(confusion_matrix(test_y, test_y_pred),
             columns=y_encoder.classes_, index=y_encoder.classes_)


def eval_model(y_true, y_pred, labels):
    """Build a per-class and overall precision/recall/F1 report.

    :param y_true: true encoded labels
    :param y_pred: predicted encoded labels
    :param labels: display name of each class, in encoded-label order
    :return: DataFrame with one row per class plus a final overall row
        (index 999) holding the support-weighted averages and total support
    """
    # Per-class precision, recall, F1 and support.
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

    # Overall figures: averages weighted by class support, and total support.
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)

    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': [u'总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    res2.index = [999]  # keeps the summary row apart from the class rows
    res = pd.concat([res1, res2])
    return res[[u'Label', u'Precision', u'Recall', u'F1', u'Support']]


eval_model(test_y, test_y_pred, y_encoder.classes_)

# 5. Save the model
import dill
import pickle

model_file = os.path.join(output_dir, u'model.pkl')
with open(model_file, 'wb') as outfile:
    pickle.dump({
        'y_encoder': y_encoder,
        'lr': model
    }, outfile)

# 6. Predict for new documents
from gensim.models import Word2Vec
import dill
import pickle
import jieba


class Predictor(object):
    """Bundle the prediction logic: word vectors plus the trained classifier.

    An instance classifies new articles without relying on any of the
    module-level variables created during training.
    """

    def __init__(self, w2v_model_file, lr_model_file):
        """Load the word2vec model and the pickled classifier bundle.

        :param w2v_model_file: path of the saved word2vec model
        :param lr_model_file: path of the pickle holding 'lr' and 'y_encoder'
        """
        self.w2v = Word2Vec.load(w2v_model_file)
        # NOTE: unpickling executes code from the file; only load model
        # files you produced yourself.
        with open(lr_model_file, 'rb') as infile:
            self.model = pickle.load(infile)

    def predict(self, articles):
        """Return the predicted channel name for every article."""
        x = self._compute_doc_vec(articles)
        y = self.model['lr'].predict(x)
        y_label = self.model['y_encoder'].inverse_transform(y)
        return y_label

    def _compute_doc_vec(self, articles):
        """Return a 2-D array with one document vector per article."""
        return np.vstack([self._compute_doc_vec_single(x) for x in articles])

    def _compute_doc_vec_single(self, article):
        """Return the mean word vector of one article.

        Uses this instance's own word vectors. If no token is in the
        vocabulary, the zero vector is returned instead of a NaN vector.
        """
        vec = np.zeros((self.w2v.wv.vector_size,), dtype=np.float32)
        n = 0
        for word in jieba.cut(article):
            if word in self.w2v.wv:
                vec += self.w2v.wv[word]
                n += 1
        if n == 0:
            return vec
        return vec / n


# Load the new documents.
new_data = pd.read_csv('data/sohu_test.txt', sep='\t', header=None,
                       dtype=np.str_, encoding='utf8',
                       names=[u'频道', u'文章'])
new_data.head()

# Load the models.
predictor = Predictor('output_word2vec/model.w2v', model_file)

# Predict the class of the first 10 articles.
new_y_pred = predictor.predict(new_data[u'文章'][:10])

# Compare predictions with the true channels.
pd.DataFrame({u'预测频道': new_y_pred, u'实际频道': new_data[u'频道'][:10]})

项目集成 main.py

from flask import Flask
from flask import render_template
from flask import request

from predicter import TfidfPredicter, Word2vecPredictor

# Model paths
tfidf_model_file = '../output/model.pkl'
word2vec_file = '../output_word2vec/model.w2v'
word2vec_model_file = '../output_w2v/model.pkl'

# Both predictors are loaded once at import time and shared by all requests.
tfidf_predicter = TfidfPredicter(tfidf_model_file)  # tf-idf model
word2vec_predicter = Word2vecPredictor(word2vec_file,
                                       word2vec_model_file)  # word2vec model

app = Flask(__name__)


@app.route('/')
def newsclass():
    """Render the article-prediction page."""
    return render_template('newsclass.html')


@app.route('/predict', methods=["GET", "POST"])
def predict():
    """Classify the submitted article with the requested model.

    Reads `news` (the article text) and `type` (the prediction method) from
    the form data for POST requests or from the query string for GET
    requests. `type == 'tfidf'` selects the tf-idf model; any other value
    selects the word2vec model.

    :return: the predicted channel name, or a 400 response when no article
        text was supplied
    """
    # Read the article and the model choice sent by the front end.
    if request.method == "POST":
        news = request.form.get("news")
        model_type = request.form.get("type")
    else:
        news = request.args.get("news")
        model_type = request.args.get("type")

    # Without article text the tokenizer would fail further down; answer
    # with a client error instead.
    if not news:
        return 'missing "news" parameter', 400

    # Dispatch to the predictor the user selected.
    if model_type == 'tfidf':
        labels = tfidf_predicter.predict([news])
    else:
        labels = word2vec_predicter.predict([news])

    # Exactly one article was passed in, so there is exactly one label.
    return labels[0]


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive debugger, and
    # host='0.0.0.0' listens on all interfaces; use this combination only
    # on a trusted development machine.
    app.run(host='0.0.0.0', debug=True)

predicter.py

from gensim.models import Word2Vec
import pickle
import jieba
import numpy as np


class Word2vecPredictor(object):
    """Classify new articles with averaged word2vec vectors.

    Bundles the word vectors and the trained classifier so that an instance
    can be used on its own to predict the category of new documents.
    """

    def __init__(self, w2v_model_file, lr_model_file):
        """Load the word vectors and the prediction model.

        :param w2v_model_file: path of the saved word2vec model
        :param lr_model_file: path of the pickled model bundle; the bundle
            is indexed with the keys 'lr' and 'y_encoder'
        """
        # Pre-trained word vectors.
        self.w2v = Word2Vec.load(w2v_model_file)
        # Trained classifier bundle.
        # NOTE: unpickling executes code from the file; only load model
        # files you produced yourself.
        with open(lr_model_file, 'rb') as infile:
            self.model = pickle.load(infile)

    def predict(self, articles):
        """Predict the category of every article.

        :param articles: list of article texts
        :return: predicted category names, one per article
        """
        # 1. Vectors of all articles.
        x = self._compute_doc_vec(articles)
        # 2. Predicted encoded classes.
        y = self.model['lr'].predict(x)
        # 3. Encoded classes -> category names.
        y_label = self.model['y_encoder'].inverse_transform(y)
        return y_label

    def _compute_doc_vec(self, articles):
        """Compute the document vectors of all articles.

        :param articles: list of article texts
        :return: 2-D array with one row per article
        """
        return np.vstack([self._compute_doc_vec_single(x) for x in articles])

    def _compute_doc_vec_single(self, article):
        """Compute the document vector of a single article.

        The vector is the mean of the vectors of all tokens found in the
        word2vec vocabulary. If no token is known, the zero vector is
        returned; dividing by a zero count would yield a NaN vector.

        :param article: text of one article
        :return: 1-D float32 vector for the article
        """
        # Lookups go through self.w2v.wv (the KeyedVectors); the model-level
        # shortcuts are deprecated in gensim 3.x and removed in gensim 4.
        vectors = self.w2v.wv
        vec = np.zeros((vectors.vector_size,), dtype=np.float32)
        n = 0
        for word in jieba.cut(article):
            if word in vectors:       # skip out-of-vocabulary tokens
                vec += vectors[word]  # running sum of the word vectors
                n += 1                # number of words that contributed
        if n == 0:
            return vec
        return vec / n  # mean word vector


class TfidfPredicter(object):
    """Classify new articles with the tf-idf bag-of-words model.

    Bundles the tf-idf feature extractor, the classifier and the label
    encoder so that an instance can be used on its own for prediction.
    """

    def __init__(self, model_file):
        """Load the pickled model bundle.

        :param model_file: path of the model bundle; the bundle is indexed
            with the keys 'tfidf', 'lr' and 'y_encoder'
        """
        # NOTE(review): the training notebook writes this bundle with dill;
        # presumably dill must be installed for pickle to resolve it --
        # verify in the deployment environment.
        # NOTE: unpickling executes code from the file; only load model
        # files you produced yourself.
        with open(model_file, 'rb') as infile:
            self.model = pickle.load(infile)

    def predict(self, articles):
        """Predict the category of every article.

        :param articles: list of article texts
        :return: predicted category names, one per article
        """
        # 1. Bag-of-words representation: tf-idf features.
        x = self.model['tfidf'].transform(articles)
        # 2. Predicted encoded classes (integers 0, 1, 2, ...).
        y = self.model['lr'].predict(x)
        # 3. Encoded classes -> category names.
        y_label = self.model['y_encoder'].inverse_transform(y)
        return y_label