【NLP | A News Text Classification Journey: Word2Vec_CNN_GRU】
Tianchi: Introduction to NLP for Beginners
- News Text Classification
- Import libraries
- Read the data
- Preprocess the data
- Define the model
- Write the submission file

Import libraries
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
%pylab inline
Read the data
train_df = pd.read_csv('../data/train_set.csv', sep='\t')
test_df = pd.read_csv('../data/test_a.csv', sep='\t')
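A quick look at the data: each row of the training set has a label column (an integer class ID; this competition has 14 news categories) and a text column of anonymized, space-separated token IDs. A minimal inspection:

print(train_df.shape, test_df.shape)   # roughly (200000, 2) and (50000, 1)
print(train_df.head())                 # columns: label (0-13), text (token IDs)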
Preprocess the data: convert the text into word vectors. A document such as

2967 6758 339 2021 1854 3731 4109 3792 4149

becomes a sequence of vectors:

[-0.04762661, -0.11038123,...,-0.00834203],
[-0.01352869, -0.13543403,...,-0.02658689],
...,
[7.74508417e-02, 6.12210967e-02,...,7.56272748e-02]

- Convert the text into fixed-length sequences
Note: the larger MAX_SEQUENCE_LENGTH is, the more memory this consumes.
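The next cell loads a pretrained Word2Vec model from disk. Training that model is not shown in this post; a minimal sketch of how it could be produced, assuming gensim 3.x (where the dimension parameter is size; gensim 4 renamed it to vector_size) and a 120-dimensional embedding to match the embedding_dims=120 used later. The window/min_count/workers values are illustrative assumptions, not the original settings:

from gensim.models import word2vec

# Train on the space-separated token corpus
sentences = [text.split(' ') for text in train_df['text']]
w2v = word2vec.Word2Vec(sentences, size=120, window=5, min_count=1, workers=4)
w2v.save('../emb/word2vec.h5')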
# Load the pretrained Word2Vec model
w2v_model = word2vec.Word2Vec.load('../emb/word2vec.h5')

# Convert each document into a fixed-length sequence of token IDs.
# pad_sequences casts the digit-string tokens to int32 and pads/truncates
# at the end of each sequence ('post').
# MAX_SEQUENCE_LENGTH = 1024
MAX_SEQUENCE_LENGTH = 50
input_categories = 'text'

list_train = list(train_df['text'].map(lambda x: x.split(' ')))
list_train_ = tf.keras.preprocessing.sequence.pad_sequences(list_train,
    padding='post', truncating='post', maxlen=MAX_SEQUENCE_LENGTH)
del list_train

list_test = list(test_df['text'].map(lambda x: x.split(' ')))
list_test_ = tf.keras.preprocessing.sequence.pad_sequences(list_test,
    padding='post', truncating='post', maxlen=MAX_SEQUENCE_LENGTH)
del list_test
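A quick shape check (the row counts are the competition's, roughly 200,000 training and 50,000 test documents):

print(list_train_.shape, list_train_.dtype)  # (200000, 50) int32
print(list_test_.shape)                      # (50000, 50)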
- Convert the sequences into word vectors
# Look up each token's Word2Vec vector; unknown tokens (including the
# 0 padding) map to a zero vector
def embedding_sentences(sentences, w2vModel):
    all_vectors = []
    embeddingDim = w2vModel.vector_size
    embeddingUnknown = [0 for i in range(embeddingDim)]
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            word = str(word)  # pad_sequences returned ints; the vocab keys are strings
            if word in w2vModel.wv.vocab:  # gensim 3.x; gensim 4 uses w2vModel.wv.key_to_index
                this_vector.append(w2vModel.wv[word])
            else:
                this_vector.append(embeddingUnknown)
        all_vectors.append(this_vector)
    return all_vectors
inputs = np.array(embedding_sentences(list_train_, w2v_model))
test_inputs = np.array(embedding_sentences(list_test_, w2v_model))
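Note the memory cost here: each document becomes a 50 × 120 float matrix, so for roughly 200,000 training documents inputs holds 200000 × 50 × 120 ≈ 1.2 × 10^9 floats, about 4.8 GB at float32 and double that at float64. This is the other reason MAX_SEQUENCE_LENGTH is kept at 50 and the intermediate lists are deleted with del.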
- Convert the class labels
output_categories = 'label'

def compute_output_arrays(df, columns):
    return np.asarray(df[columns].astype(int))

outputs = compute_output_arrays(train_df, output_categories)
Define the model
- A custom loss function: Focal Loss
def Focal_Loss(y_true, y_pred, alpha=0.5, gamma=2):
    y_pred += tf.keras.backend.epsilon()         # avoid log(0)
    ce = -y_true * tf.math.log(y_pred)           # per-class cross-entropy
    weight = tf.pow(1 - y_pred, gamma) * y_true  # down-weight easy examples
    fl = ce * weight * alpha
    # y_true is one-hot, so only one class is non-zero; max picks it out
    reduce_fl = tf.keras.backend.max(fl, axis=-1)
    return reduce_fl
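A quick sanity check of the down-weighting (the numbers are made-up examples): a confidently correct prediction is scaled almost to zero relative to plain cross-entropy.

y_true = tf.constant([[0., 1., 0.]])
y_pred = tf.constant([[0.05, 0.90, 0.05]])
# Plain cross-entropy would be -log(0.9) ≈ 0.105; the focal term
# multiplies it by alpha * (1 - 0.9)^gamma = 0.5 * 0.01
print(float(Focal_Loss(y_true, y_pred)))  # ≈ 0.00053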
- Build the model
def create_model(embedding_dims, max_len, num_class):
    tensor_input = tf.keras.Input(shape=(max_len, embedding_dims))
    # Three parallel separable-convolution branches (kernel sizes 3/4/5),
    # TextCNN style, each max-pooled over the whole sequence
    cnn1 = tf.keras.layers.SeparableConv1D(256, 3, padding='same', strides=1, activation='relu')(tensor_input)
    cnn1 = tf.keras.layers.BatchNormalization()(cnn1)
    cnn1 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn1)
    cnn2 = tf.keras.layers.SeparableConv1D(256, 4, padding='same', strides=1, activation='relu')(tensor_input)
    cnn2 = tf.keras.layers.BatchNormalization()(cnn2)
    cnn2 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn2)
    cnn3 = tf.keras.layers.SeparableConv1D(256, 5, padding='same', strides=1, activation='relu')(tensor_input)
    cnn3 = tf.keras.layers.BatchNormalization()(cnn3)
    cnn3 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn3)
    cnn = tf.keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
    x = tf.keras.layers.Dropout(0.2)(cnn)
    x = tf.keras.layers.Flatten()(x)
    # Reshape to a length-1 sequence so the recurrent branches can consume it
    x = tf.reshape(x, (-1, 1, x.shape[-1]))
    x = tf.keras.layers.Dense(768, activation='relu')(x)
    # Branch 1: stacked bidirectional LSTMs
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x1)
    x1 = tf.keras.layers.Dense(32, activation='relu')(x1)
    # Branch 2: stacked bidirectional GRUs
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(x)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True))(x2)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32))(x2)
    x2 = tf.keras.layers.Dense(16, activation='relu')(x2)
    x = tf.keras.layers.Concatenate()([x1, x2])
    x = tf.keras.layers.Dropout(0.2)(x)
    tensor_output = tf.keras.layers.Dense(num_class, activation='softmax')(x)
    model = tf.keras.models.Model(inputs=tensor_input, outputs=tensor_output)
    # model.summary()  # uncomment to inspect the architecture
    optimizer = tf.keras.optimizers.Nadam(learning_rate=1e-5)
    FL = lambda y_true, y_pred: Focal_Loss(y_true, y_pred, alpha=0.25, gamma=2)
    model.compile(loss=FL, optimizer=optimizer, metrics=['acc'])
    return model
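One quirk worth noting: MaxPool1D(pool_size=max_len) collapses each convolutional branch to a single time step, so after Flatten and the reshape the recurrent branches see a sequence of length 1. The BiLSTM/BiGRU stacks therefore act as additional gated dense layers over the pooled CNN features rather than as true sequence models.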
- 5-fold training, averaging the test predictions
gkf = StratifiedKFold(n_splits=5).split(X=train_df[input_categories].fillna('13'),
                                        y=train_df[output_categories].fillna('13'))

test_preds = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    train_inputs = inputs[train_idx]
    train_outputs = to_categorical(outputs[train_idx], num_classes=14)
    valid_inputs = inputs[valid_idx]
    valid_outputs = to_categorical(outputs[valid_idx], num_classes=14)
    # Destroy the current TF graph and build a fresh one, so layers from
    # the previous fold's model don't accumulate
    tf.keras.backend.clear_session()
    # Build and train the model
    model = create_model(embedding_dims=120, max_len=MAX_SEQUENCE_LENGTH, num_class=14)
    model.fit(train_inputs, train_outputs,
              validation_data=(valid_inputs, valid_outputs),
              epochs=7, batch_size=16)
    # Predict on the test set
    test_preds.append(model.predict(test_inputs))

# Average the softmax probabilities over the 5 folds, then take the argmax
preds = np.average(test_preds, axis=0)
preds = np.argmax(preds, axis=1)
Write the submission file
submission = pd.read_csv('../data/test_a_sample_submit.csv')
submission['label'] = preds
submission.to_csv('../output/Word2Vec_submission.csv', index=False)