Some preprocessing of the cnews data
import jieba
import pandas as pd
import tensorflow as tf
from collections import Counter
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer


# Read the stopword list
def read_stopword(filename):
    stopword = []
    fp = open(filename, 'r', encoding='utf-8')
    for line in fp.readlines():
        stopword.append(line.replace('\n', ''))
    fp.close()
    return stopword


# Tokenize the data and remove stopwords
def cut_data(data, stopword):
    words = []
    for content in data['content']:
        word = list(jieba.cut(content))
        for w in list(set(word) & set(stopword)):
            while w in word:
                word.remove(w)
        words.append(word)
    data['content'] = words
    return data


# Collect the full word list
def word_list(data):
    all_word = []
    for word in data['content']:
        all_word.extend(word)
    return all_word


# Extract features
def feature(train_data, test_data, val_data):
    content = pd.concat([train_data['content'], test_data['content'], val_data['content']], ignore_index=True)
    # count_vec = CountVectorizer(max_features=300, min_df=2)
    # count_vec.fit_transform(content)
    # train_fea = count_vec.transform(train_data['content']).toarray()
    # test_fea = count_vec.transform(test_data['content']).toarray()
    # val_fea = count_vec.transform(val_data['content']).toarray()
    model = Word2Vec(content, size=100, min_count=1, window=10, iter=10)
    train_fea = train_data['content'].apply(lambda x: model[x])
    test_fea = test_data['content'].apply(lambda x: model[x])
    val_fea = val_data['content'].apply(lambda x: model[x])
    return train_fea, test_fea, val_fea


if __name__ == '__main__':
    train_data = pd.read_csv('./data/task1/cnews/cnews.train.txt', names=['title', 'content'], sep='\t')  # (50000, 2)
    test_data = pd.read_csv('./data/task1/cnews/cnews.test.txt', names=['title', 'content'], sep='\t')  # (10000, 2)
    val_data = pd.read_csv('./data/task1/cnews/cnews.val.txt', names=['title', 'content'], sep='\t')  # (5000, 2)

    train_data = train_data.head(50)
    test_data = test_data.head(50)
    val_data = val_data.head(50)

    stopword = read_stopword('./data/stopword.txt')

    train_data = cut_data(train_data, stopword)
    test_data = cut_data(test_data, stopword)
    val_data = cut_data(val_data, stopword)

    train_fea, test_fea, val_fea = feature(train_data, test_data, val_data)
    print(train_fea)

    all_word = []
    all_word.extend(word_list(train_data))
    all_word.extend(word_list(test_data))
    all_word.extend(word_list(val_data))
    all_word = list(set(all_word))
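A note on the features returned by feature(): model[x] gives one 100-dimensional vector per word, so every document becomes a variable-length (num_words, 100) array. Classifiers that expect a fixed-length input usually pool these first. Below is a minimal mean-pooling sketch under that assumption; the helper name doc_vector is ours, and the trained Word2Vec model from feature() (gensim 3.x API) is assumed to be available:

import numpy as np

def doc_vector(words, model, dim=100):
    # Average the vectors of the words the model knows; zero vector for empty documents.
    vecs = [model.wv[w] for w in words if w in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

# e.g. train_matrix = np.vstack(train_data['content'].apply(lambda x: doc_vector(x, w2v_model)))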
Training on the cnews data with PyTorch
import os
import csv
import jieba
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.autograd as autograd

class_num = 10
batch_size = 256
maxlen = 100
word2vec_size = 100

train_dir = './data/cnews/cnews.train.txt'
valid_dir = './data/cnews/cnews.val.txt'
test_dir = './data/cnews/cnews.test.txt'
word2vec_dir = './word2vec/word2vec.hdf5'
userdict_dir = './dict/userdict.txt'
stopword_dir = './dict/stopword.txt'


# Tokenize a sentence with jieba, dropping stopwords and single-character tokens
def cut_word(x, stop_word):
    words = []
    for word in list(jieba.cut(x)):
        if word not in stop_word and len(word) != 1:
            words.append(word)
    return words


# Collect the vocabulary over all tokenized documents
def get_word_vocab(content):
    word_vocb = []
    for sentence in content:
        word_vocb.extend(list(set(sentence)))
    return list(set(word_vocb))


# Map each document to a fixed-length sequence of word indices (zero-padded / truncated to maxlen)
def get_x(content, word_index):
    X = np.zeros((len(content), maxlen), dtype=np.int64)
    for i in range(len(content)):
        if len(content[i]) < maxlen:
            for j in range(0, len(content[i])):
                X[i][j] = word_index[content[i][j]]
        else:
            for j in range(0, maxlen):
                X[i][j] = word_index[content[i][j]]
    return X
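get_x pads short documents with 0 (the word indices built below start at 1, so 0 is free to act as padding) and truncates long ones at maxlen. A quick illustrative check with a toy vocabulary of our own, not taken from the original data:

toy_index = {'体育': 1, '比赛': 2, '冠军': 3}
toy_X = get_x([['体育', '比赛'], ['冠军'] * 150], toy_index)
print(toy_X.shape)  # (2, maxlen); row 0 ends in zeros, row 1 is cut off at maxlen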
def get_label_vector(label):
    label_code = pd.get_dummies(list(set(label)))
    label_vector = dict()
    for col in label_code.columns:
        label_vector[col] = label_code[col].tolist()
    return label_vector


print('read data')
train = pd.read_csv(valid_dir, delimiter='\t', index_col=None, names=['label', 'content'])
test = pd.read_csv(test_dir, delimiter='\t', index_col=None, names=['label', 'content'])
print(train.shape)
print(test.shape)

print('cut word')
jieba.load_userdict(userdict_dir)
stop_word = pd.read_csv(stopword_dir, quoting=csv.QUOTE_NONE, index_col=None, names=['word'])['word'].tolist()
train['content'] = train['content'].apply(lambda x: cut_word(x, stop_word))
test['content'] = test['content'].apply(lambda x: cut_word(x, stop_word))
content = pd.concat([train['content'], test['content']], axis=0, ignore_index=True)

print('word vocab')
word_vocab = get_word_vocab(content)
word_index = dict(zip(word_vocab, range(1, len(word_vocab) + 1)))  # index 0 is reserved for padding
index_word = dict(zip(list(word_index.values()), list(word_index.keys())))

print('word2vec')
if not os.path.exists(word2vec_dir):
    model = Word2Vec(content, size=word2vec_size, seed=2019, min_count=5, window=10, iter=10, workers=8)
    model.save(word2vec_dir)
else:
    model = Word2Vec.load(word2vec_dir)

embedding_matrix = np.zeros((len(word_index) + 1, word2vec_size))
for word, i in word_index.items():
    if word in model:
        embedding_matrix[i] = model[word]

print('label')
label_vector = get_label_vector(train['label'])
y_train = train['label'].map(label_vector)
y_test = test['label'].map(label_vector)
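label_vector maps each of the 10 categories to a one-hot list. nn.CrossEntropyLoss, which the Model class below uses, expects integer class indices rather than one-hot vectors, so the targets typically need one more conversion step. A minimal sketch with our own label_index mapping (not part of the original code):

label_index = {label: i for i, label in enumerate(sorted(set(train['label'])))}
y_train_idx = torch.LongTensor(train['label'].map(label_index).values)  # shape (N,), values in [0, class_num)
y_test_idx = torch.LongTensor(test['label'].map(label_index).values)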
class DataLoader():
    def __init__(self, data, config, w2v_model):
        self.data = data
        self.batch_size = config['batch_size']
        self.maxlen = config['maxlen']
        self.label_vector = config['label_vector']
        self.word_index = config['word_index']
        self.embedding = config['embedding']
        self.w2v_model = w2v_model

    def data_to_matrix(self, content):
        X = np.zeros((len(content), self.maxlen), dtype=np.int64)
        for i in range(len(content)):
            if len(content[i]) < self.maxlen:
                for j in range(0, len(content[i])):
                    X[i][j] = self.word_index[content[i][j]]
            else:
                for j in range(0, self.maxlen):
                    X[i][j] = self.word_index[content[i][j]]
        return X
    def train_batch_data(self, is_shuffle=True):
        data, batch_size = self.data, self.batch_size
        if is_shuffle:
            data = data.sample(frac=1).reset_index(drop=True)
        length = len(data) // batch_size
        if batch_size * length < len(data):
            length += 1
        for i in tqdm(range(length)):
            if batch_size * (i + 1) > len(data):
                batch_data = data.loc[batch_size * i:, :]
            else:
                batch_data = data.loc[batch_size * i:batch_size * (i + 1) - 1, :]
            yield batch_data

    def test_batch_data(self):
        data, batch_size = self.data, self.batch_size
        length = len(data) // batch_size
        if batch_size * length < len(data):
            length += 1
        for i in tqdm(range(length)):
            if batch_size * (i + 1) > len(data):
                batch_data = data.loc[batch_size * i:, :]
            else:
                batch_data = data.loc[batch_size * i:batch_size * (i + 1) - 1, :]
            yield batch_data


class textCNN(nn.Module):
    def __init__(self, config):
        super(textCNN, self).__init__()
        vocab_size = config['vocab_size']
        embedding_dim = config['embedding_dim']
        class_num = config['class_num']
        embedding_matrix = config['embedding_matrix']

        self.embedding = nn.Embedding(vocab_size, embedding_dim, _weight=embedding_matrix)
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        # After four MaxPool2d(2) layers a 100x100 input shrinks to 6x6, so the flattened size is 128 * 6 * 6
        self.out = nn.Linear(128 * 6 * 6, class_num)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), 1, maxlen, word2vec_size)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1)  # flatten (batch, out_channels, w, h) to (batch, out_channels * w * h)
        output = self.out(x)
        return output
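A quick way to sanity-check the textCNN dimensions is a throwaway forward pass with random weights: with maxlen = word2vec_size = 100, the four MaxPool2d(2) layers shrink 100 to 50, 25, 12 and finally 6, which is where the 128 * 6 * 6 flattened size comes from. The snippet below is only such a check, not part of the original pipeline; the underscore-prefixed names are ours:

_check_cfg = {'vocab_size': 1000, 'embedding_dim': word2vec_size, 'class_num': class_num,
              'embedding_matrix': torch.randn(1000, word2vec_size)}  # random stand-in embedding table
_net = textCNN(_check_cfg)
_dummy = torch.randint(0, 1000, (4, maxlen))  # a fake batch of 4 padded index sequences
print(_net(_dummy).shape)  # expected: torch.Size([4, 10])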
config = dict()
config['vocab_size'] = len(word_vocab) + 1  # +1 because word indices start at 1 and 0 is the padding index
config['class_num'] = class_num
config['batch_size'] = batch_size
config['maxlen'] = maxlen
config['label_vector'] = label_vector
config['word_index'] = word_index
config['learning_rate'] = 1e-3
config['embedding_dim'] = word2vec_size
config['embedding_matrix'] = torch.Tensor(embedding_matrix)
class Model():
    def __init__(self, train_wide_deep_loader, valid_wide_deep_loader, test_wide_deep_loader, config):
        self.train_loader = train_wide_deep_loader
        self.valid_loader = valid_wide_deep_loader
        self.test_loader = test_wide_deep_loader
        self.model = textCNN(config=config)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = Adam(self.model.parameters(), lr=config['learning_rate'])
        self.mx_qauc = 0

    def verification(self):
        res = []
        for query, title, fea, label in self.valid_loader.train_batch_data(is_shuffle=True):
            out = self.model(query, title, fea)
            res.extend([item.detach().numpy()[1] for item in list(out)])
        res = pd.DataFrame(res, columns=['pred'])
        valid_ans = pd.concat([self.valid_loader.data.loc[:, ['query_id', 'label']], res], axis=1)
        qauc = calculate_qauc(valid_ans)
        print('qauc is:')
        print(qauc)
        if qauc > self.mx_qauc:
            self.mx_qauc = qauc
            torch.save(self.model, './wnd/model/model.pkl')

    def fit(self, epoch):
        for i in range(epoch):
            for X_train in self.train_loader.train_batch_data():
                out = self.model(query, title, fea)  # forward pass: predicted values
                self.optimizer.zero_grad()  # reset gradients to zero
                loss = self.criterion(out, autograd.Variable(label.long()))  # loss function
                loss.backward()  # back-propagate to compute gradients
                self.optimizer.step()  # update all parameters
            self.verification()

    def restore(self):
        self.model = torch.load('./wnd/model/model.pkl')

    def predict(self):
        res = []
        for query, title, fea in self.test_loader.test_batch_data():
            out = self.model(query, title, fea)
            res.extend([item.detach().numpy()[1] for item in list(out)])
        res = pd.DataFrame(res, columns=['pred'])
        res.to_csv('./nn_res.csv', header=None, index=None, sep=',')


model = Model(train_loader, valid_loader, test_loader, config)
model.fit(1)
# model = Model(train_loader, valid_loader, test_loader, config)
# model.restore()
model.predict()
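The Model class above still expects inputs (query, title, fea), a qauc metric and the ./wnd paths that are not produced by the cnews loaders, and train_loader / valid_loader / test_loader are never constructed in the snippet. For reference, here is a minimal sketch of how the pieces defined earlier could be wired together for cnews; everything in it is an assumption of ours (integer class labels, a plain accuracy check, None for the unused w2v_model argument), not the original training code:

label_index = {lab: i for i, lab in enumerate(sorted(set(train['label'])))}  # class indices for CrossEntropyLoss

def batch_to_tensors(loader, batch):
    # Turn one batch DataFrame into padded index sequences and integer targets.
    X = torch.LongTensor(loader.data_to_matrix(batch['content'].tolist()))
    y = torch.LongTensor(batch['label'].map(label_index).values)
    return X, y

config['embedding'] = torch.Tensor(embedding_matrix)  # key read by DataLoader.__init__
train_loader = DataLoader(train, config, None)  # the stored w2v_model is never used, so None is fine here
test_loader = DataLoader(test, config, None)

net = textCNN(config=config)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), lr=config['learning_rate'])

for epoch in range(1):
    for batch in train_loader.train_batch_data():
        X, y = batch_to_tensors(train_loader, batch)
        optimizer.zero_grad()
        loss = criterion(net(X), y)
        loss.backward()
        optimizer.step()

# simple accuracy check on the test split
correct, total = 0, 0
with torch.no_grad():
    for batch in test_loader.test_batch_data():
        X, y = batch_to_tensors(test_loader, batch)
        correct += (net(X).argmax(dim=1) == y).sum().item()
        total += len(y)
print('test accuracy:', correct / total)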