Deep Learning | First- and Second-Level News Label Classification with a Keras Model

A deep learning model for first- and second-level news label classification

The model takes a news article's title, body, and source as inputs. It reaches roughly 90% accuracy on first-level labels and 72% on second-level labels, and it resolves the problem of the two label levels being inconsistent with each other.
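The consistency fix rests on a fixed first-to-second-level mapping matrix: the model first picks a first-level label, then uses that label's row of the matrix to mask the second-level scores, so only sub-labels that belong to the predicted first-level label can win. Below is a minimal NumPy sketch of the idea with purely hypothetical labels and mapping; in the model further down, the same matrix is loaded as the frozen weights of an Embedding layer indexed by the predicted first-level label.

import numpy as np

# Hypothetical mapping: 2 first-level labels, 4 second-level labels.
# Row i has a 1 in column j iff second-level label j belongs to first-level label i.
class_map = np.array([[1, 1, 0, 0],   # e.g. "Sports"  -> {"Football", "Basketball"}
                      [0, 0, 1, 1]])  # e.g. "Finance" -> {"Stocks", "Funds"}

first_probs = np.array([0.2, 0.8])              # model favours first-level label 1 ("Finance")
second_scores = np.array([2.0, 1.5, 0.3, 0.9])  # raw second-level scores before masking

first_pred = np.argmax(first_probs)             # -> 1
mask = class_map[first_pred]                    # -> [0, 0, 1, 1]
second_pred = np.argmax(second_scores * mask)   # -> 3, guaranteed to belong to label 1
print(first_pred, second_pred)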
Import the required packages

import tensorflow as tf
import pandas as pd
import numpy as np
from keras_bert import Tokenizer
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras import backend as K
from scipy import sparse
from tensorflow.keras.optimizers import Adam
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import json
import argparse
"""
Model purpose: first- and second-level label classification
Date: 2020/12
Model description: built with a TextCNN deep learning model
"""
def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(usage="it's usage tip.",
                                     description="predict news type")
    parser.add_argument("--label-sample-num", default=10000, type=int,
                        help="number of samples per label")
    parser.add_argument("--embedding-size", default=128, type=int,
                        help="character embedding dimension")
    parser.add_argument("--max-title-size", default=32, type=int,
                        help="maximum number of characters in the input title")
    parser.add_argument("--max-content-size", default=512, type=int,
                        help="maximum number of characters in the article body")
    parser.add_argument("--max-source-size", default=32, type=int,
                        help="maximum number of characters in the article source")
    parser.add_argument("--batch-size", default=16, type=int,
                        help="number of samples per training batch")
    parser.add_argument("--epochs", default=5, type=int,
                        help="number of training epochs")
    parser.add_argument("--model-save-path", type=str, help="model save path")
    parser.add_argument("--model-desc-save-path", type=str,
                        help="save path for the model description file")
    parser.add_argument("--con1-size", default=128, type=int,
                        help="number of filters in the first convolution layer")
    parser.add_argument("--con2-size", default=64, type=int,
                        help="number of filters in the second convolution layer")
    parser.add_argument("--dense-size", default=128, type=int,
                        help="dense (fully connected) layer size")
    parser.add_argument("--learning-rate", default=0.001, type=float,
                        help="learning rate")
    return parser.parse_args()
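For a quick local sanity check of the argument parsing, sys.argv can be overridden before calling parse_args(); the script name and paths below are placeholders, not part of the original project:

import sys

sys.argv = ["train_news_classify.py",            # hypothetical script name
            "--epochs", "3",
            "--model-save-path", "hdfs:///tmp/news_model",
            "--model-desc-save-path", "hdfs:///tmp/news_model_desc.json"]
args = parse_args()
print(args.epochs, args.batch_size, args.learning_rate)  # 3 16 0.001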
Define the model class

class TextCNN(object):
    """Deep learning text classification model.

    Takes the title, source and body of a news article as inputs, encodes them,
    and outputs the article's first- and second-level labels.

    Args:
        vocab: path to the character vocabulary file
        embedding_size: dimension of the character embeddings
        max_title: maximum title length
        max_content: maximum body length
        max_source: maximum source length
        first_class_num: number of first-level labels
        second_class_num: number of second-level labels

    Returns:
        model1, model2: the training model and the serving model
    """

    FIRST_CLASS = "first_class"
    SECOND_CLASS = "second_class"

    def __init__(self, vocab, embedding_size, max_title, max_content, max_source,
                 first_class_num, second_class_num):
        self.vocab = vocab
        self.max_title = max_title
        self.max_content = max_content
        self.max_source = max_source
        self.tokenizer, self.vocab_size = self.__get_tokenizer()
        self.embedding_size = embedding_size
        self.first_class_num = first_class_num
        self.second_class_num = second_class_num

    def __get_tokenizer(self):
        """Build the character-to-index dictionary and return the tokenizer and vocabulary size."""
        token_dict = {}
        with open(self.vocab, 'r', encoding='utf-8') as reader:
            for line in reader:
                token = line.strip()
                token_dict[token] = len(token_dict)
        return Tokenizer(token_dict), len(token_dict)

    def get_tokenizer(self):
        return self.tokenizer

    def encode(self, text, max_len):
        """Encode a text into character indices, truncated to max_len and padded with 0."""
        return self.tokenizer.encode(first=text, max_len=max_len)[0]

    def get_model(self, class_map, con1_size, con2_size, dense_size, learning_rate):
        """Build the model that maps title/source/body to first- and second-level labels."""
        title = Input(shape=(self.max_title,), name='title_ids', dtype=tf.float32)
        content = Input(shape=(self.max_content,), name='content_ids', dtype=tf.float32)
        source = Input(shape=(self.max_source,), name='source_ids', dtype=tf.float32)
        embedding_layer = Embedding(self.vocab_size + 1, self.embedding_size)
        # Frozen first-to-second-level mapping, used to enforce label consistency
        mask_layer = Embedding(self.first_class_num, self.second_class_num,
                               weights=[class_map], trainable=False)
        embedding_title = embedding_layer(title)
        embedding_content = embedding_layer(content)
        embedding_source = embedding_layer(source)
        flat_layers = []
        for embedding in [embedding_title, embedding_content, embedding_source]:
            layers = []
            for i in [3, 5, 7]:
                conv = Conv1D(con1_size, i, padding='same', strides=1, activation='relu')(embedding)
                pool = MaxPooling1D(pool_size=3, padding='same')(conv)
                conv = Conv1D(con2_size, i, padding='same', strides=1, activation='relu')(pool)
                pool = MaxPooling1D(pool_size=3, padding='same')(conv)
                layers += [pool]
            flat = Flatten()(concatenate(layers, axis=-1))
            flat = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat)))
            flat_layers += [flat]
        flat_concat = concatenate(flat_layers, axis=-1)
        dense = Activation("relu")(BatchNormalization()(Dense(dense_size, activation=None)(flat_concat)))
        output_first = Dense(self.first_class_num, activation='softmax')(dense)
        first_class_value = Lambda(lambda x: K.argmax(x), name=self.FIRST_CLASS)(output_first)
        mask = mask_layer(first_class_value)
        second = Dense(self.second_class_num, activation=None)(dense)
        second = Multiply()([second, mask])
        output_second = Activation("softmax")(second)
        second_class_value = Lambda(lambda x: K.argmax(x), name=self.SECOND_CLASS)(output_second)
        model1 = Model(inputs=[title, content, source], outputs=[output_first, output_second])
        model1.compile(loss='sparse_categorical_crossentropy',
                       optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
        model2 = Model(inputs=[title, content, source],
                       outputs=[first_class_value, second_class_value])
        model2.summary()
        return model1, model2
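A small smoke test of the class, assuming a vocab.txt with one token per line that includes the special tokens [PAD], [UNK], [CLS] and [SEP] which keras_bert's Tokenizer relies on; all sizes and the toy label mapping below are hypothetical:

import numpy as np

text_cnn = TextCNN(vocab="vocab.txt", embedding_size=64,
                   max_title=32, max_content=128, max_source=16,
                   first_class_num=10, second_class_num=40)

title_ids = text_cnn.encode("央行发布最新货币政策报告", text_cnn.max_title)
print(len(title_ids))  # 32: truncated or padded to max_title

# class_map must be a (first_class_num, second_class_num) 0/1 matrix marking
# which second-level labels belong to each first-level label.
class_map = np.zeros((10, 40))
class_map[np.arange(10).repeat(4), np.arange(40)] = 1  # toy mapping: 4 sub-labels each

model1, model2 = text_cnn.get_model(class_map, con1_size=64, con2_size=32,
                                    dense_size=64, learning_rate=0.001)
model1.summary()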

Define the model description class

class ModelDescription(object):
    """Description file for the classification model.

    Describes the model's input and output fields.

    Args:
        dim: field dimension
        map_key: name of the raw data field the model input maps to
        data_type: data type of the model input field
        handler: preprocessing handler
        fill_value: value used to pad a field that is shorter than dim

    Returns:
        model: the model description dictionary
    """

    def __init__(self):
        # Per-instance description dictionary
        self.model = {}
        self.model['model_desc'] = {}
        self.model['model_desc']['signature_name'] = ""
        self.model['model_desc']['inputs'] = {}
        self.model['model_desc']['outputs'] = []

    def build_context_field(self, dim, map_key, tensor_name, data_type="int",
                            handler="tokenizer", fill_value=0):
        field = {'dim': dim, 'map_key': map_key, 'tensor_name': tensor_name,
                 'data_type': data_type, 'handler': handler, 'fill_value': fill_value}
        return field

    def build_source(self, length, tensor_name):
        return self.build_context_field(length, "source", tensor_name)

    def build_title(self, length, tensor_name):
        return self.build_context_field(length, "title", tensor_name)

    def build_content(self, length, tensor_name):
        return self.build_context_field(length, "content", tensor_name)

    def set_context(self, source_len, source_tensor_name, title_len, title_tensor_name,
                    content_len, content_tensor_name):
        source = self.build_source(source_len, source_tensor_name)
        title = self.build_title(title_len, title_tensor_name)
        content = self.build_content(content_len, content_tensor_name)
        self.model['model_desc']['inputs']['context'] = [source, title, content]

    def add_out_put(self, map_key, tensor_name, tag_name):
        output = {"map_key": map_key, "tensor_name": tensor_name, "data_type": "int",
                  "handler": "tags", "tag_name": tag_name, "fill_value": "0", "dim": -1}
        self.model['model_desc']['outputs'] = self.model['model_desc']['outputs'] + [output]

    def to_json(self):
        return json.dumps(self.model, ensure_ascii=False)
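A short usage sketch showing the kind of JSON the description class emits; the tensor names match the model above, while the map_key and label names are made up for illustration:

desc = ModelDescription()
desc.set_context(32, 'source_ids', 32, 'title_ids', 512, 'content_ids')
desc.add_out_put('first_label', 'first_class', ['Sports', 'Finance'])
print(desc.to_json())
# {"model_desc": {"signature_name": "", "inputs": {"context": [
#     {"dim": 32, "map_key": "source", "tensor_name": "source_ids", "data_type": "int",
#      "handler": "tokenizer", "fill_value": 0}, ...]},
#   "outputs": [{"map_key": "first_label", "tensor_name": "first_class", "data_type": "int",
#                "handler": "tags", "tag_name": ["Sports", "Finance"], "fill_value": "0", "dim": -1}]}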

def news_classify_algo():
    # Create the Spark session used to pull the sample data
    spark = SparkSession \
        .builder \
        .config("spark.sql.broadcastTimeout", "3000") \
        .master("yarn") \
        .enableHiveSupport() \
        .getOrCreate()
    args = parse_args()

    # Read the sample data from the Hive table
    sql = '''select news_id, title, content, type, source, content_type, first_label, second_label
             from dp_dm.content_center_news_classify_sample_data
             where rank <= {}'''.format(args.label_sample_num)
    news_sample = spark.sql(sql).toPandas()

    # Drop rows with missing values and duplicates
    news = news_sample[['news_id', 'title', 'content', 'first_label',
                        'source', 'second_label']].dropna().drop_duplicates()

    # Build the label indices
    category = sorted(np.unique(news['first_label'].dropna().values))
    sub_category = sorted(np.unique(news['second_label'].dropna().values))
    category_map = dict(zip(category, np.arange(len(category))))
    sub_category_map = dict(zip(sub_category, np.arange(len(sub_category))))

    # Initialize the model
    text_cnn = TextCNN(vocab="vocab.txt", embedding_size=args.embedding_size,
                       max_title=args.max_title_size, max_content=args.max_content_size,
                       max_source=args.max_source_size, first_class_num=len(category),
                       second_class_num=len(sub_category))

    # Encode the labels and features
    news['category'] = news['first_label'].map(category_map)
    news['sub_category'] = news['second_label'].map(sub_category_map)
    news['title_ids'] = news['title'].apply(lambda x: text_cnn.encode(x, text_cnn.max_title))
    news['content_ids'] = news['content'].apply(lambda x: text_cnn.encode(x, text_cnn.max_content))
    news['source_ids'] = news['source'].apply(lambda x: text_cnn.encode(x, text_cnn.max_source))

    # Map each second-level label to its first-level label
    category_level_reverse_map = dict(zip(news['sub_category'], news['category']))

    # Split into training and validation sets
    train_x, test_x, train_y, test_y = train_test_split(
        news[['title_ids', 'content_ids', 'source_ids']],
        news[['category', 'sub_category']])

    # Build the first-to-second-level mapping matrix
    def get_class_matrix(class_dict):
        data = np.ones(len(class_dict))
        rows = list(class_dict.values())  # first-level label indices
        cols = list(class_dict.keys())    # second-level label indices
        map_mat = sparse.csr_matrix((data, (rows, cols))).todense()
        return map_mat

    # Assemble the x and y arrays
    tx_title = np.array(train_x['title_ids'].values.tolist()).astype(np.float32)
    tx_content = np.array(train_x['content_ids'].values.tolist()).astype(np.float32)
    tx_source = np.array(train_x['source_ids'].values.tolist()).astype(np.float32)
    tx = [tx_title, tx_content, tx_source]
    ty_cate = np.array(train_y['category'].values.tolist()).astype(np.float32)
    ty_subcate = np.array(train_y['sub_category'].values.tolist()).astype(np.float32)
    ty = [ty_cate, ty_subcate]
    ex_title = np.array(test_x['title_ids'].values.tolist()).astype(np.float32)
    ex_content = np.array(test_x['content_ids'].values.tolist()).astype(np.float32)
    ex_source = np.array(test_x['source_ids'].values.tolist()).astype(np.float32)
    ex = [ex_title, ex_content, ex_source]
    ey_cate = np.array(test_y['category'].values.tolist()).astype(np.float32)
    ey_subcate = np.array(test_y['sub_category'].values.tolist()).astype(np.float32)
    ey = [ey_cate, ey_subcate]

    model1, model2 = text_cnn.get_model(get_class_matrix(category_level_reverse_map),
                                        args.con1_size, args.con2_size,
                                        args.dense_size, args.learning_rate)

    # Train the model
    model1.fit(x=tx, y=ty, batch_size=args.batch_size,
               validation_data=(ex, ey), epochs=args.epochs)

    # Save the serving model to HDFS
    model2.save(args.model_save_path)

    # Build the model description file
    news_model = ModelDescription()
    news_model.set_context(args.max_source_size, 'source_ids', args.max_title_size,
                           'title_ids', args.max_content_size, 'content_ids')
    news_model.add_out_put('一级标签', text_cnn.FIRST_CLASS, list(category_map.keys()))
    news_model.add_out_put('二级标签', text_cnn.SECOND_CLASS, list(sub_category_map.keys()))

    sc = spark.sparkContext
    fs_class = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    conf_class = sc._gateway.jvm.org.apache.hadoop.conf.Configuration
    fs = fs_class.get(conf_class())
    path_class = sc._gateway.jvm.org.apache.hadoop.fs.Path

    def save_file(path: str, data: str):
        """Save a file to HDFS.

        Args:
            path(str): path on HDFS
            data(str): data to write
        """
        output = fs.create(path_class(path))
        output.write(data.encode())
        output.flush()
        output.close()

    # Save the description file to HDFS
    data = news_model.to_json()
    save_file(args.model_desc_save_path, data)

Main entry point

if __name__ == '__main__':
    news_classify_algo()
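Once training has run, the saved serving model can be loaded back for prediction. A rough inference sketch, assuming the model was saved to /tmp/news_model, the same vocab.txt is on hand, and the sizes match those used at training time (all of these values are placeholders):

import numpy as np
import tensorflow as tf

# Rebuild the encoder exactly as at training time (hypothetical sizes).
text_cnn = TextCNN(vocab="vocab.txt", embedding_size=128,
                   max_title=32, max_content=512, max_source=32,
                   first_class_num=10, second_class_num=40)

model = tf.keras.models.load_model("/tmp/news_model", compile=False)

title = np.array([text_cnn.encode("新闻标题示例", 32)], dtype=np.float32)
content = np.array([text_cnn.encode("新闻正文示例", 512)], dtype=np.float32)
source = np.array([text_cnn.encode("新闻来源示例", 32)], dtype=np.float32)

first_idx, second_idx = model.predict([title, content, source])
# The returned values are indices into category_map / sub_category_map.
print(int(first_idx[0]), int(second_idx[0]))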
