【kaggle | Text-classification models combining word2vec with common CNN/RNN network architectures】To fill in the deep-learning background I need for graduation, I have been studying the fundamentals; this post records my attempt to combine word2vec with deep-learning models.
- Dataset source
- Data preprocessing
- Training the word2vec model
- Building and training the networks
Dataset source
The dataset comes from Kaggle's getting-started NLP competition, "Natural Language Processing with Disaster Tweets", which asks you to predict whether a tweet announces a real disaster.
Data preprocessing
Loading the data:
import numpy as np
import pandas as pd
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
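A quick look at what was just loaded; for this competition the CSVs carry id, keyword, location and text columns (train.csv additionally has target):
print(train_df.shape, test_df.shape)
print(train_df.columns.tolist())
print(train_df['text'].head())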
Cleaning the text:
import re
import os, sys
import string
# stop-word list (imported here but not actually used below)
from nltk.corpus import stopwords

# lowercase
def text_to_lowercase(text):
    return text.lower()

# remove punctuation
def text_remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# remove URLs
def text_remove_url(text):
    return re.sub(r"http\S+", "", text)

# remove @handles
def text_remove_twitter_handle(text):
    return re.sub(r'@[^\s]+', '', text)

# strip leading/trailing whitespace (str.strip() removes spaces/newlines by default)
def text_remove_leadtrail_spaces(text):
    return text.strip()

def clean_text(text):
    # order matters: strip handles and URLs before punctuation is removed
    text1 = text_remove_twitter_handle(text)
    text2 = text_remove_url(text1)
    text3 = text_remove_punctuation(text2)
    text4 = text_to_lowercase(text3)
    text5 = text_remove_leadtrail_spaces(text4)
    return text5
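For example, on a made-up tweet the whole pipeline behaves like this:
# hypothetical input, just to illustrate the cleaning order
print(clean_text("@user Forest fire near La Ronge http://t.co/x  "))
# prints: forest fire near la ronge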
# apply the cleaner to every row (a plain list comprehension over the Series)
train_df['text_processed'] = [clean_text(i) for i in train_df["text"]]
test_df['text_processed'] = [clean_text(i) for i in test_df["text"]]

feature = train_df['text_processed']
target = train_df['target']
Training the word2vec model
from gensim.models import Word2Vec

# Word2Vec expects tokenized sentences, so split each cleaned tweet on whitespace
sentences = [s.split() for s in feature]
# vector length 500, 8 iterations, skip-gram (sg=1) with negative sampling (hs=0),
# window 6, minimum word frequency 7; the model is saved in binary format
# (gensim 3.x API; in gensim 4+ the arguments are vector_size= and epochs=)
w2v_model = Word2Vec(sentences, size=500, sg=1, hs=0, window=6, iter=8, min_count=7)
w2v_model.wv.save_word2vec_format("./word2Vec" + ".pkl", binary=True)
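The saved vectors can be reloaded later without retraining; a minimal sketch using gensim's standard loader (the query word is only an example and must have survived the min_count=7 cutoff):
from gensim.models import KeyedVectors

# reload the binary vectors written above
wv = KeyedVectors.load_word2vec_format("./word2Vec.pkl", binary=True)
# nearest neighbours by cosine similarity
print(wv.most_similar('fire', topn=5))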
Importing the Keras packages
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
# model containers
from keras.models import Sequential, Model
# layers
from keras.layers import Dense, Embedding, Activation, Input, Lambda, Reshape
from keras.layers import Conv1D, Convolution1D, MaxPool1D, MaxPooling1D, GlobalAveragePooling1D
from keras.layers import Flatten, Dropout, BatchNormalization
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.layers.merge import concatenate
Splitting the data and converting text to integer sequences
# number of target classes
NUM_CLASS = 2
# input sequence length
INPUT_SIZE = 64

# Tokenizer vectorizes text: it maps each word to an integer index
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tokenizer.fit_on_texts(feature)
vocab = tokenizer.word_index
x_ids = tokenizer.texts_to_sequences(feature)
# pad/truncate every sequence to exactly INPUT_SIZE tokens
pad_s = pad_sequences(x_ids, maxlen=INPUT_SIZE)

# one-hot encode the labels
target_u = to_categorical(target, NUM_CLASS)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(pad_s, target_u, random_state=22, test_size=0.2)
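A quick shape check; train.csv has 7613 rows, so an 80/20 split leaves roughly 6090 training and 1523 validation examples:
# each row is a fixed-length integer sequence, each label a 2-way one-hot vector
print(X_train.shape, y_train.shape)  # e.g. (6090, 64) (6090, 2)
print(X_test.shape, y_test.shape)    # e.g. (1523, 64) (1523, 2)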
Building and training the networks
Injecting word2vec into the Embedding layer
embeding_matrix = np.zeros((len(vocab) + 1, 500))
for word, i in vocab.items():
    try:
        # copy the pre-trained vector for every word the w2v model knows
        embeding_vector = w2v_model.wv[word]
        embeding_matrix[i] = embeding_vector
    except KeyError:
        # words below min_count have no vector and stay all-zero
        continue
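Because min_count=7 discards rare words, it is worth checking how much of the tokenizer vocabulary actually received a pre-trained vector; a small diagnostic sketch:
# how many tokenizer words got a word2vec vector
covered = sum(1 for w in vocab if w in w2v_model.wv)
print(f"{covered}/{len(vocab)} words covered; the rest stay zero-initialized")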
TextCNN with word2vec
main_input = Input(shape=(INPUT_SIZE,), dtype='float64')
embedder = Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE, weights=[embeding_matrix], trainable=True)
embed = embedder(main_input)
# three parallel convolution branches with kernel sizes 3, 4 and 5;
# each max-pooling window is large enough to reduce the branch to length 1
cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
cnn1 = MaxPooling1D(pool_size=38)(cnn1)
cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
cnn2 = MaxPooling1D(pool_size=37)(cnn2)
cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
cnn3 = MaxPooling1D(pool_size=36)(cnn3)
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
main_output = Dense(NUM_CLASS, activation='softmax')(drop)
model = Model(inputs=main_input, outputs=main_output)
model.summary()
Model summary: (model.summary() output shown as a screenshot in the original post)
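Almost all of the parameters sit in the trainable embedding: with vocabulary size V it stores (V+1) x 500 weights, which dwarfs the convolution branches (500 x k x 256 + 256 weights for kernel size k). A quick cross-check against model.summary():
# embedding parameters: one 500-dim row per word index, plus row 0 for padding
print((len(vocab) + 1) * 500)
# one conv branch, kernel size 3: weights + biases
print(500 * 3 * 256 + 256)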
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
Training results: (training log shown as a screenshot in the original post)
Other models
CNN with word2vec
model = Sequential()
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True)) # the Embedding layer maps each word index to its word2vec vector
model.add(Conv1D(256, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(128, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(64, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())  # batch normalization layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
RNN (LSTM) with word2vec
model = Sequential()
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
Bi-GRU with word2vec
# architecture: embedding -> two stacked bidirectional GRU layers -> dense
model = Sequential()
# INPUT_SIZE (64) is the padded sequence length
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1)))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
CNN+RNN in series, with word2vec
# architecture: embedding -> convolution + pooling -> two stacked GRU layers -> dense
model = Sequential()
model.add(Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True))
model.add(Convolution1D(256, 3, padding='same', strides = 1))
model.add(Activation('relu'))
model.add(MaxPool1D(pool_size=2))
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences = True))
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
CNN+RNN in parallel, with word2vec
# architecture: embedding -> CNN branch (convolution + pooling + dense) in parallel
# with an RNN branch (bidirectional GRU + dense) -> concatenate -> dense
main_input = Input(shape=(INPUT_SIZE,), dtype='float64')
embed = Embedding(len(vocab)+1,500,input_length=INPUT_SIZE,weights=[embeding_matrix],trainable=True)(main_input)
cnn = Convolution1D(256, 3, padding='same', strides = 1, activation='relu')(embed)
cnn = MaxPool1D(pool_size=4)(cnn)
cnn = Flatten()(cnn)
cnn = Dense(256)(cnn)
rnn = Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1))(embed)
rnn = Dense(256)(rnn)
con = concatenate([cnn,rnn], axis=-1)
main_output = Dense(NUM_CLASS, activation='softmax')(con)
model = Model(inputs = main_input, outputs = main_output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(X_test, y_test))
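Whichever of these models is trained, leaderboard predictions come from running the test set through the identical tokenizer and padding; a minimal sketch, assuming the competition's standard submission format with id and target columns:
# preprocess the cleaned test tweets exactly like the training data
test_ids = tokenizer.texts_to_sequences(test_df['text_processed'])
test_pad = pad_sequences(test_ids, maxlen=INPUT_SIZE)
pred = model.predict(test_pad)

# argmax turns the 2-way softmax back into a 0/1 label
submission = pd.DataFrame({'id': test_df['id'], 'target': pred.argmax(axis=1)})
submission.to_csv('submission.csv', index=False)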