自然语言处理|新闻文本分类之旅 Word2Vec_CNN_GRU

【自然语言处理|新闻文本分类之旅 Word2Vec_CNN_GRU】
天池-零基础入门NLP

  • 新闻文本分类
    • 导入相关库
    • 读取数据
    • 数据预处理
    • 自定义模型
    • 输出上传文件

新闻文本分类 导入相关库
import numpy as np import pandas as pd from gensim.models import word2vec from sklearn.model_selection import StratifiedKFold from tensorflow.keras.utils import to_categorical import tensorflow as tf import warnings warnings.filterwarnings('ignore') %pylab inline

读取数据
# Load the train / test splits; both files are tab-separated.
def _load_tsv(path):
    """Read one tab-separated data file into a DataFrame."""
    return pd.read_csv(path, sep='\t')

train_df = _load_tsv('../data/train_set.csv')
test_df = _load_tsv('../data/test_a.csv')

数据预处理 将文本数据预处理,转换成词向量
2967 6758 339 2021 1854 3731 4109 3792 4149
转换成
[-0.04762661, -0.11038123,...,-0.00834203],
[-0.01352869, -0.13543403,...,-0.02658689],
...,
[7.74508417e-02, 6.12210967e-02,...,7.56272748e-02]
  • 将文本数据转换成序列
    注意:MAX_SEQUENCE_LENGTH 越大越吃内存
# Load the pre-trained Word2Vec model from disk.
w2v_model = word2vec.Word2Vec.load('../emb/word2vec.h5')

# Convert the raw text into fixed-length token sequences.
# NOTE: a larger MAX_SEQUENCE_LENGTH costs proportionally more memory.
# MAX_SEQUENCE_LENGTH = 1024
MAX_SEQUENCE_LENGTH = 50
input_categories = 'text'


def _padded_token_sequences(frame):
    """Split the 'text' column on spaces, then pad/truncate every row
    (at the tail, `post`) to MAX_SEQUENCE_LENGTH."""
    tokens = [row.split(' ') for row in frame['text']]
    return tf.keras.preprocessing.sequence.pad_sequences(
        tokens, padding='post', truncating='post',
        maxlen=MAX_SEQUENCE_LENGTH)


list_train_ = _padded_token_sequences(train_df)
list_test_ = _padded_token_sequences(test_df)

  • 将序列转换成词向量
# Convert token sequences into Word2Vec embedding vectors.
def embedding_sentences(sentences, w2vModel):
    """Map every token of every sentence to its Word2Vec vector.

    Parameters
    ----------
    sentences : iterable of token sequences; tokens may be ints (from
        pad_sequences) or strings — each is stringified before lookup.
    w2vModel : trained gensim Word2Vec model (gensim 3.x API: vocabulary
        lives under ``wv.vocab``).

    Returns
    -------
    list of per-sentence lists of vectors. Out-of-vocabulary tokens
    (including the padding value 0, unless '0' happens to be in the
    vocabulary) map to an all-zero vector of length ``vector_size``.
    """
    embedding_dim = w2vModel.vector_size
    unknown_vec = [0 for _ in range(embedding_dim)]  # OOV / padding fallback
    vocab = w2vModel.wv.vocab  # hoisted: one attribute lookup, not per token
    all_vectors = []
    for sentence in sentences:
        # Fixed: index via ``wv`` — bare ``model[word]`` is deprecated in
        # gensim 3.x and removed entirely in gensim 4.x.
        all_vectors.append([
            w2vModel.wv[word] if word in vocab else unknown_vec
            for word in map(str, sentence)
        ])
    return all_vectors


inputs = np.array(embedding_sentences(list_train_, w2v_model))
test_inputs = np.array(embedding_sentences(list_test_, w2v_model))

  • 类别标签转换
# Turn the label column into an integer numpy array for training.
output_categories = 'label'


def compute_output_arrays(df, columns):
    """Return the selected column(s) of *df* cast to int as a numpy array."""
    labels = df[columns].astype(int)
    return np.asarray(labels)


outputs = compute_output_arrays(train_df, output_categories)

自定义模型
  • 用 Focal Loss 自定义损失函数
def Focal_Loss(y_true, y_pred, alpha=0.5, gamma=2):
    """Focal loss for one-hot targets.

    Down-weights easy examples via a ``(1 - p)**gamma`` modulating factor
    so training focuses on hard, misclassified samples.

    Parameters
    ----------
    y_true : one-hot ground-truth tensor.
    y_pred : predicted class probabilities (softmax output).
    alpha : scalar weight applied to the loss.
    gamma : focusing exponent; larger values suppress easy examples more.

    Returns
    -------
    Per-sample focal loss tensor (class axis reduced).
    """
    probs = y_pred + tf.keras.backend.epsilon()  # guard against log(0)
    cross_entropy = -y_true * tf.math.log(probs)
    modulating = y_true * tf.pow(1 - probs, gamma)
    focal = alpha * cross_entropy * modulating
    # y_true is one-hot, so only the true class contributes a non-zero
    # term; max over the class axis therefore selects exactly that term.
    return tf.keras.backend.max(focal, axis=-1)

  • 自定义模型
def create_model(embedding_dims, max_len, num_class):
    """Build the TextCNN + BiLSTM/BiGRU classifier.

    Three parallel separable-conv branches (kernel sizes 3/4/5) pool over
    the whole sequence, are concatenated, then fed to two recurrent
    branches (BiLSTM stack and BiGRU stack) whose outputs are merged into
    a softmax head. Compiled with Nadam and a focal-loss wrapper.

    Args:
        embedding_dims: size of each token's embedding vector
            (must match the Word2Vec vector_size — presumably 120 here;
            TODO confirm against the loaded model).
        max_len: sequence length of the input (tokens per sample).
        num_class: number of output classes.

    Returns:
        A compiled tf.keras Model taking (batch, max_len, embedding_dims)
        inputs and producing (batch, num_class) softmax probabilities.
    """
    tensor_input = tf.keras.Input(shape=(max_len, embedding_dims))
    # Branch 1: kernel size 3, max-pooled across the full sequence length
    # so each branch yields a single 256-dim feature per sample.
    cnn1 = tf.keras.layers.SeparableConv1D(256, 3, padding='same', strides=1, activation='relu')(tensor_input)
    cnn1 = tf.keras.layers.BatchNormalization()(cnn1)
    cnn1 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn1)
    # Branch 2: kernel size 4.
    cnn2 = tf.keras.layers.SeparableConv1D(256, 4, padding='same', strides=1, activation='relu')(tensor_input)
    cnn2 = tf.keras.layers.BatchNormalization()(cnn2)
    cnn2 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn2)
    # Branch 3: kernel size 5.
    cnn3 = tf.keras.layers.SeparableConv1D(256, 5, padding='same', strides=1, activation='relu')(tensor_input)
    cnn3 = tf.keras.layers.BatchNormalization()(cnn3)
    cnn3 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn3)
    # Merge the three conv branches along the feature axis (256*3 = 768).
    cnn = tf.keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
    x = tf.keras.layers.Dropout(0.2)(cnn)
    x = tf.keras.layers.Flatten()(x)
    # NOTE(review): this reshape turns the flat vector into a sequence of
    # length 1, so the RNNs below only ever see a single timestep — the
    # recurrence is effectively a dense transform; verify this is intended.
    x = tf.reshape(x, (-1, 1, x.shape[-1]))
    x = tf.keras.layers.Dense(768, activation='relu')(x)
    # Recurrent branch 1: stacked bidirectional LSTMs.
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x1)
    x1 = tf.keras.layers.Dense(32, activation='relu')(x1)
    # Recurrent branch 2: stacked bidirectional GRUs.
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(x)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True))(x2)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32))(x2)
    x2 = tf.keras.layers.Dense(16, activation='relu')(x2)
    # Merge both recurrent branches and classify.
    x = tf.keras.layers.Concatenate()([x1, x2])
    x = tf.keras.layers.Dropout(0.2)(x)
    tensor_output = tf.keras.layers.Dense(num_class, activation='softmax')(x)
    model = tf.keras.models.Model(tensor_input, outputs=tensor_output)
    # model.summary()  # inspect the model architecture
    optimizer = tf.keras.optimizers.Nadam(learning_rate=1e-5)
    # Focal loss with fixed alpha/gamma; wrapped in a lambda to match the
    # (y_true, y_pred) loss signature Keras expects.
    FL = lambda y_true, y_pred: Focal_Loss(y_true, y_pred, alpha=0.25, gamma=2)
    model.compile(loss=FL, optimizer=optimizer, metrics=['acc'])
    return model

  • 5折训练 + 预测取平均
# 5-fold stratified training; average the per-fold test predictions.
gkf = StratifiedKFold(n_splits=5).split(
    X=train_df[input_categories].fillna('13'),
    y=train_df[output_categories].fillna('13'))

test_preds = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    train_inputs = inputs[train_idx]
    train_outputs = to_categorical(outputs[train_idx])
    valid_inputs = inputs[valid_idx]
    valid_outputs = to_categorical(outputs[valid_idx])

    # Destroy the current TF graph between folds so stale models/layers
    # don't accumulate. Fixed: the original called K.clear_session() but
    # ``K`` was never imported (NameError) — use the full tf.keras path.
    tf.keras.backend.clear_session()

    # Build a fresh model per fold. embedding_dims=120 must match the
    # Word2Vec vector_size — TODO confirm against the trained embedding.
    model = create_model(embedding_dims=120,
                         max_len=MAX_SEQUENCE_LENGTH,
                         num_class=14)
    # Fixed: validation_data had a scraped article URL pasted into it
    # (a syntax error); it must be an (inputs, targets) pair.
    model.fit(train_inputs, train_outputs,
              validation_data=(valid_inputs, valid_outputs),
              epochs=7, batch_size=16)
    # Collect this fold's test-set probabilities.
    test_preds.append(model.predict(test_inputs))

# Average the K folds' probabilities, then take argmax as the final label.
preds = np.average(test_preds, axis=0)
preds = np.argmax(preds, axis=1)

输出上传文件
# Write the predicted labels into the sample-submission template.
submission = pd.read_csv('../data/test_a_sample_submit.csv')
submission = submission.assign(label=preds)
submission.to_csv('../output/Word2Vec_submission.csv', index=False)

    推荐阅读