NLP Practice - Task 1

Some data preprocessing on the cnews dataset.

import jieba
import pandas as pd
import tensorflow as tf
from collections import Counter
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

# NOTE: the size/iter keyword arguments and model[word] lookups below assume
# gensim < 4.0; gensim 4.x renamed them to vector_size/epochs and uses model.wv[word].


# Read the stopword list
def read_stopword(filename):
    stopword = []
    fp = open(filename, 'r')
    for line in fp.readlines():
        stopword.append(line.replace('\n', ''))
    fp.close()
    return stopword


# Tokenize the texts and remove stopwords
def cut_data(data, stopword):
    words = []
    for content in data['content']:
        word = list(jieba.cut(content))
        for w in list(set(word) & set(stopword)):
            while w in word:
                word.remove(w)
        words.append(word)
    data['content'] = words
    return data


# Collect the word list
def word_list(data):
    all_word = []
    for word in data['content']:
        all_word.extend(word)
    return all_word


# Extract features
def feature(train_data, test_data, val_data):
    content = pd.concat([train_data['content'], test_data['content'], val_data['content']], ignore_index=True)
    # count_vec = CountVectorizer(max_features=300, min_df=2)
    # count_vec.fit_transform(content)
    # train_fea = count_vec.transform(train_data['content']).toarray()
    # test_fea = count_vec.transform(test_data['content']).toarray()
    # val_fea = count_vec.transform(val_data['content']).toarray()
    model = Word2Vec(content, size=100, min_count=1, window=10, iter=10)
    train_fea = train_data['content'].apply(lambda x: model[x])
    test_fea = test_data['content'].apply(lambda x: model[x])
    val_fea = val_data['content'].apply(lambda x: model[x])
    return train_fea, test_fea, val_fea


if __name__ == '__main__':
    train_data = pd.read_csv('./data/task1/cnews/cnews.train.txt', names=['title', 'content'], sep='\t')  # (50000, 2)
    test_data = pd.read_csv('./data/task1/cnews/cnews.test.txt', names=['title', 'content'], sep='\t')    # (10000, 2)
    val_data = pd.read_csv('./data/task1/cnews/cnews.val.txt', names=['title', 'content'], sep='\t')      # (5000, 2)

    # Keep only the first 50 rows of each split to speed things up
    train_data = train_data.head(50)
    test_data = test_data.head(50)
    val_data = val_data.head(50)

    stopword = read_stopword('./data/stopword.txt')
    train_data = cut_data(train_data, stopword)
    test_data = cut_data(test_data, stopword)
    val_data = cut_data(val_data, stopword)

    train_fea, test_fea, val_fea = feature(train_data, test_data, val_data)
    print(train_fea)

    all_word = []
    all_word.extend(word_list(train_data))
    all_word.extend(word_list(test_data))
    all_word.extend(word_list(val_data))
    all_word = list(set(all_word))
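Note that feature() returns, for each document, a list of per-word vectors whose length varies with the document, which cannot be fed directly into most classifiers. A common next step is to average the word vectors into one fixed-length vector per document. The snippet below is a minimal sketch of that idea, assuming a trained gensim Word2Vec model like the one built inside feature(); the helper name doc_vector is my own, not part of the original code.

import numpy as np

def doc_vector(words, model, size=100):
    # Average the Word2Vec vectors of the words in one document.
    # Out-of-vocabulary words are skipped; an all-zero vector is returned
    # for documents with no known words.
    vectors = [model.wv[w] for w in words if w in model.wv]
    if not vectors:
        return np.zeros(size)
    return np.mean(vectors, axis=0)

# Example: turn the tokenized 'content' column into an (n_docs, 100) feature matrix
# train_X = np.vstack(train_data['content'].apply(lambda x: doc_vector(x, model)))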

Training a model on the cnews data with PyTorch
import os
import csv
import jieba
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.autograd as autograd

class_num = 10
batch_size = 256
maxlen = 100
word2vec_size = 100

train_dir = './data/cnews/cnews.train.txt'
valid_dir = './data/cnews/cnews.val.txt'
test_dir = './data/cnews/cnews.test.txt'
word2vec_dir = './word2vec/word2vec.hdf5'
userdict_dir = './dict/userdict.txt'
stopword_dir = './dict/stopword.txt'


# Tokenize one text, dropping stopwords and single-character tokens
def cut_word(x, stop_word):
    words = []
    for word in list(jieba.cut(x)):
        if word not in stop_word and len(word) != 1:
            words.append(word)
    return words


# Build the vocabulary over all tokenized texts
def get_word_vocab(content):
    word_vocb = []
    for sentence in content:
        word_vocb.extend(list(set(sentence)))
    return list(set(word_vocb))


# Map each text to a fixed-length (maxlen) row of word indices
def get_x(content, word_index):
    X = np.zeros((len(content), maxlen))  # the original np.array((len(content), maxlen)) builds a 2-element array, not a matrix
    for i in range(len(content)):
        if len(content[i]) < maxlen:
            for j in range(0, len(content[i])):
                X[i][j] = word_index[content[i][j]]
        else:
            for j in range(0, maxlen):
                X[i][j] = word_index[content[i][j]]
    return X


# One-hot encode the label set
def get_label_vector(label):
    label_code = pd.get_dummies(list(set(label)))
    label_vector = dict()
    for col in label_code.columns:
        label_vector[col] = label_code[col].tolist()
    return label_vector


print('read data')
# NOTE: the validation file is loaded as the training set here (presumably to keep the run small)
train = pd.read_csv(valid_dir, delimiter='\t', index_col=None, names=['label', 'content'])
test = pd.read_csv(test_dir, delimiter='\t', index_col=None, names=['label', 'content'])

print(train.shape)
print(test.shape)

print('cut word')
jieba.load_userdict(userdict_dir)
stop_word = pd.read_csv(stopword_dir, quoting=csv.QUOTE_NONE, index_col=None, names=['word'])['word'].tolist()
train['content'] = train['content'].apply(lambda x: cut_word(x, stop_word))
test['content'] = test['content'].apply(lambda x: cut_word(x, stop_word))
content = pd.concat([train['content'], test['content']], axis=0, ignore_index=True)

print('word vocab')
word_vocab = get_word_vocab(content)
word_index = dict(zip(word_vocab, range(1, len(word_vocab) + 1)))
index_word = dict(zip(list(word_index.values()), list(word_index.keys())))

print('word2vec')
# NOTE: size/iter and the model[word] / "word in model" lookups below assume gensim < 4.0
if not os.path.exists(word2vec_dir):
    model = Word2Vec(content, size=word2vec_size, seed=2019, min_count=5, window=10, iter=10, workers=8)
    model.save(word2vec_dir)
else:
    model = Word2Vec.load(word2vec_dir)

# Row 0 is reserved for padding; row i holds the vector of the word with index i
embedding_matrix = np.zeros((len(word_index) + 1, word2vec_size))
for word, i in word_index.items():
    if word in model:
        embedding_matrix[i] = model[word]

print('label')
label_vector = get_label_vector(train['label'])
y_train = train['label'].map(label_vector)
y_test = test['label'].map(label_vector)


class DataLoader():
    def __init__(self, data, config, w2v_model):
        self.data = data
        self.batch_size = config['batch_size']
        self.maxlen = config['maxlen']
        self.label_vector = config['label_vector']
        self.word_index = config['word_index']
        self.embedding = config['embedding']
        self.w2v_model = w2v_model

    def data_to_matrix(self, content):
        X = np.zeros((len(content), self.maxlen))
        for i in range(len(content)):
            if len(content[i]) < maxlen:
                for j in range(0, len(content[i])):
                    X[i][j] = self.word_index[content[i][j]]
            else:
                for j in range(0, maxlen):
                    X[i][j] = self.word_index[content[i][j]]
        return X

    # The batch generators are called as loader methods further down
    # (self.train_loader.train_batch_data(), ...), so they take self and
    # read the data and batch size from the instance.
    def train_batch_data(self, is_shuffle=True):
        data = self.data
        batch_size = self.batch_size
        if is_shuffle:
            data = data.sample(frac=1).reset_index(drop=True)
        length = len(data) // batch_size
        if batch_size * length < len(data):
            length += 1
        for i in tqdm(range(length)):
            if batch_size * (i + 1) > len(data):
                batch_data = data.loc[batch_size * i:, :]
            else:
                batch_data = data.loc[batch_size * i:batch_size * (i + 1) - 1, :]
            yield batch_data

    def test_batch_data(self):
        data = self.data
        batch_size = self.batch_size
        length = len(data) // batch_size
        if batch_size * length < len(data):
            length += 1
        for i in tqdm(range(length)):
            if batch_size * (i + 1) > len(data):
                batch_data = data.loc[batch_size * i:, :]
            else:
                batch_data = data.loc[batch_size * i:batch_size * (i + 1) - 1, :]
            yield batch_data


class textCNN(nn.Module):
    def __init__(self, config):
        super(textCNN, self).__init__()
        vocab_size = config['vocab_size']
        embedding_dim = config['embedding_dim']
        class_num = config['class_num']
        embedding_matrix = config['embedding_matrix']

        self.embeding = nn.Embedding(vocab_size, embedding_dim, _weight=embedding_matrix)
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2))
        # The original hard-coded nn.Linear(512, class_num), which does not match the
        # flattened conv output for maxlen=100 and word2vec_size=100 (128 * 6 * 6 = 4608),
        # so the input size is computed here instead.
        self.out = nn.Linear(128 * (maxlen // 16) * (word2vec_size // 16), class_num)

    def forward(self, x):
        x = self.embeding(x)
        x = x.view(x.size(0), 1, maxlen, word2vec_size)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1)  # flatten (batch, out_channels, w, h) to (batch, out_channels * w * h)
        output = self.out(x)
        return output


config = dict()
config['vocab_size'] = len(word_vocab) + 1  # +1 so it matches embedding_matrix, whose row 0 is the padding index
config['class_num'] = class_num
config['batch_size'] = batch_size
config['maxlen'] = maxlen
config['label_vector'] = label_vector
config['word_index'] = word_index
config['learning_rate'] = 1e-3
config['embedding_dim'] = word2vec_size
config['embedding_matrix'] = torch.Tensor(embedding_matrix)


# NOTE: the Model class below was adapted from another project and still references
# names that do not exist here (query, title, fea, label, query_id, calculate_qauc,
# self.mx_qauc, train_loader/valid_loader/test_loader); it needs to be rewritten
# around textCNN and the cnews data before it can run.
class Model():
    def __init__(self, train_wide_deep_loader, valid_wide_deep_loader, test_wide_deep_loader, config):
        self.train_loader = train_wide_deep_loader
        self.valid_loader = valid_wide_deep_loader
        self.test_loader = test_wide_deep_loader
        self.model = textCNN(config=config)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = Adam(self.model.parameters(), lr=config['learning_rate'])

    def verification(self):
        res = []
        for query, title, fea, label in self.valid_loader.train_batch_data(is_shuffle=True):
            out = self.model(query, title, fea)
            res.extend([item.detach().numpy()[1] for item in list(out)])

        res = pd.DataFrame(res, columns=['pred'])
        valid_ans = pd.concat([self.valid_loader.data.loc[:, ['query_id', 'label']], res], axis=1)

        qauc = calculate_qauc(valid_ans)
        print('qauc is:')
        print(qauc)
        if qauc > self.mx_qauc:
            self.mx_qauc = qauc
            torch.save(self.model, './wnd/model/model.pkl')

    def fit(self, epoch):
        for i in range(epoch):
            for X_train in self.train_loader.train_batch_data():
                out = self.model(query, title, fea)                          # forward pass to get predictions
                self.optimizer.zero_grad()                                   # zero the gradients
                loss = self.criterion(out, autograd.Variable(label.long()))  # compute the loss
                loss.backward()                                              # backpropagate
                self.optimizer.step()                                        # update all parameters

            self.verification()

    def restore(self):
        self.model = torch.load('./wnd/model/model.pkl')

    def predict(self):
        res = []
        for query, title, fea in self.test_loader.test_batch_data():
            out = self.model(query, title, fea)
            res.extend([item.detach().numpy()[1] for item in list(out)])

        res = pd.DataFrame(res, columns=['pred'])
        res.to_csv('./nn_res.csv', header=None, index=None, sep=',')


# NOTE: train_loader, valid_loader and test_loader (presumably DataLoader instances)
# are never constructed in the posted code.
model = Model(train_loader, valid_loader, test_loader, config)
model.fit(1)
# model = Model(train_loader, valid_loader, test_loader, config)
# model.restore()
model.predict()
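Since the Model class above will not run as posted, here is a minimal, self-contained sketch of how the textCNN defined above could be trained on the padded index matrix produced by get_x. It is an illustration under my own assumptions (integer class ids instead of the one-hot label_vector, a plain slicing batcher), not the author's original training loop.

import torch
import torch.nn as nn
from torch.optim import Adam

# Integer-encode the labels: CrossEntropyLoss expects class indices, not one-hot vectors.
classes = sorted(set(train['label']))
label_to_id = {c: i for i, c in enumerate(classes)}

X_train = torch.LongTensor(get_x(train['content'].tolist(), word_index))
y_train_ids = torch.LongTensor(train['label'].map(label_to_id).values)

net = textCNN(config)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(net.parameters(), lr=config['learning_rate'])

for epoch in range(1):
    for start in range(0, len(X_train), batch_size):
        xb = X_train[start:start + batch_size]      # (batch, maxlen) word indices
        yb = y_train_ids[start:start + batch_size]  # (batch,) class ids
        optimizer.zero_grad()                       # zero the gradients
        out = net(xb)                               # forward pass, (batch, class_num) logits
        loss = criterion(out, yb)                   # cross-entropy loss
        loss.backward()                             # backpropagate
        optimizer.step()                            # update parameters
    print('epoch %d done, last batch loss %.4f' % (epoch, loss.item()))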

[NLP Practice - Task 1] Evaluation metrics: https://blog.csdn.net/zh11403070219/article/details/82026338
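As a quick companion to the linked post on evaluation metrics, scikit-learn's accuracy_score and classification_report compute accuracy, precision, recall and per-class F1 from the true and predicted labels. The snippet below is a generic sketch with made-up y_true / y_pred values just to show the calls.

from sklearn.metrics import accuracy_score, classification_report

# y_true / y_pred: equally long sequences of true and predicted class ids (or label strings)
y_true = [0, 1, 2, 2, 1]
y_pred = [0, 1, 2, 1, 1]

print('accuracy:', accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))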
