NLP | ALBERT tiny: an engineering implementation of ALBERT, the fastest-to-train, high-accuracy language model

First of all, thanks to 实在智能 for providing the pre-trained ALBERT tiny language model. This experiment covers the performance of four ALBERT models, as well as four BERT models, on a classification task.
brightmart/albert_zh
The RoBERTa language model is available here:
brightmart/roberta_zh
My previous article was taken down by Zhihu because I had personally added promotional content to it. That was indeed my own fault: I did something I should not have done while sharing knowledge, and I apologize to Zhihu. My original intention in sharing is to explain, through Chinese descriptions and Chinese-language data, how each of these models performs on the tasks.
The tasks include text classification, named entity recognition, reading comprehension, and triple extraction, which covers the mainstream natural language processing tasks.

Compute was provided by www.52lm.xyz: two 2080 Ti cards.
The code is based on Su Jianlin's bert4keras; I made a few small adjustments on top of his original example.

#! -*- coding:utf-8 -*-
# Similar to a sentiment-analysis setup: load the albert_zh weights (https://github.com/brightmart/albert_zh)
import json
import os

import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Lambda, Dense
from keras.models import Model
from keras.optimizers import Adam

from bert4keras.bert import load_pretrained_model, set_gelu
from bert4keras.train import PiecewiseLinearLearningRate
from bert4keras.utils import SimpleTokenizer, load_vocab

set_gelu('tanh')  # switch the gelu variant

config_path = '../albert_tiny/albert_config_tiny.json'
checkpoint_path = '../albert_tiny/albert_model.ckpt'
dict_path = '../albert_tiny/vocab.txt'

CONFIG = {
    'max_len': 256,
    'batch_size': 48,
    'epochs': 32,
    'use_multiprocessing': True,
    'model_dir': os.path.join('../model_files/bert'),
}
max_len = CONFIG['max_len']
batch_size = CONFIG['batch_size']

# Columns of Train_Data.csv: id, title, text, entity, negative, key_entity
train_message = pd.read_csv('../data/Train_Data.csv', header=None).values.tolist()
chars = {}
data = []
for feather_data in train_message:
    if feather_data[2] is not None and feather_data[3] is not None:
        # concatenate title + text as the input, "negative" as the label
        data.append((str(feather_data[2]) + str(feather_data[3]), feather_data[4]))
        # count character frequencies to build a reduced vocabulary
        for c in str(feather_data[2]) + str(feather_data[3]):
            if c is not None:
                if c in chars:
                    chars[c] += 1
                else:
                    chars[c] = 1
chars = {i: j for i, j in chars.items() if j >= 4}  # keep characters seen at least 4 times

_token_dict = load_vocab(dict_path)  # read the full vocabulary
token_dict, keep_words = {}, []  # keep_words: indices of the BERT vocab entries we keep

for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    token_dict[c] = len(token_dict)
    keep_words.append(_token_dict[c])

for c in chars:
    if c in _token_dict:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

# Fix the shuffle order across runs so the train/validation split stays reproducible
if not os.path.exists('./random_order.json'):
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    json.dump(random_order, open('./random_order.json', 'w'), indent=4)
else:
    random_order = json.load(open('./random_order.json'))

# Split into training and validation sets at a 9:1 ratio
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]


def seq_padding(X, padding=0):
    """Pad every sequence in X to the length of the longest one."""
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])


class data_generator:
    def __init__(self, data, batch_size=batch_size):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:max_len]
                x1, x2 = tokenizer.encode(first=text)  # token ids and segment ids
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []


model = load_pretrained_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # keep only the tokens in keep_words to shrink the original vocab
    albert=True
)

output = Lambda(lambda x: x[:, 0])(model.output)  # take the [CLS] vector
output = Dense(1, activation='sigmoid')(output)   # binary "negative" classifier
model = Model(model.input, output)

save = ModelCheckpoint(
    os.path.join(CONFIG['model_dir'], 'bert.h5'),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='auto'
)
early_stopping = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=8,
    verbose=1,
    mode='auto'
)
callbacks = [save, early_stopping]

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
    metrics=['accuracy']
)
model.summary()

train_D = data_generator(train_data)
valid_D = data_generator(valid_data)

model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=10,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D),
    callbacks=callbacks
)


def predict(model, test_data):
    """Predict the probability of the "negative" label for a list of raw texts."""
    X1 = []
    X2 = []
    for s in test_data:
        x1, x2 = tokenizer.encode(first=s[:CONFIG['max_len']])
        X1.append(x1)
        X2.append(x2)
    X1 = seq_padding(X1)
    X2 = seq_padding(X2)
    predict_results = model.predict([X1, X2])
    return predict_results


test_data = pd.read_csv(os.path.join('data/Test_Data.csv'), encoding='utf-8')
predict_test = []
for i in test_data['text']:
    if i is not None:
        predict_test.append(str(i))
predict_results = predict(model, predict_test)

with open(os.path.join('data/bert/food-predict.csv'), 'w') as f:
    f.write("id,negative,key_entity\n")
    for i in range(test_data.shape[0]):
        label = 1 if predict_results[i][0] > 0.5 else 0
        if label == 1:
            f.write(test_data.id[i] + ',' + str(label) + ',' + test_data.entity[i] + '\n')
        else:
            f.write(test_data.id[i] + ',' + str(label) + '\n')
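The checkpoint saved above contains bert4keras custom layers, so the simplest way I know to reuse it later is to rebuild the network exactly as in the training script and restore the weights from bert.h5. The snippet below is only a sketch of that idea, not part of the original code: build_inference_model is my own helper name, and it assumes config_path, checkpoint_path, keep_words and the predict() helper defined above are still in scope.

# Minimal inference-time sketch (my own addition): rebuild the same network and
# restore the best checkpoint saved by ModelCheckpoint. load_weights() can read
# the weights stored inside the full-model bert.h5 file.
import os
from keras.layers import Lambda, Dense
from keras.models import Model
from bert4keras.bert import load_pretrained_model

def build_inference_model(weights_path):
    base = load_pretrained_model(
        config_path,            # same paths as in the training script
        checkpoint_path,
        keep_words=keep_words,  # same reduced vocabulary
        albert=True
    )
    out = Lambda(lambda x: x[:, 0])(base.output)   # [CLS] vector
    out = Dense(1, activation='sigmoid')(out)      # binary "negative" head
    inference_model = Model(base.input, out)
    inference_model.load_weights(weights_path)     # restore fine-tuned weights
    return inference_model

# usage:
# best_model = build_inference_model(os.path.join(CONFIG['model_dir'], 'bert.h5'))
# probs = predict(best_model, predict_test)        # reuse the predict() helper above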

For the follow-up comparison experiments, four 2080 Ti cards were used.
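The training script itself runs on a single card. For readers who want to spread each batch across several GPUs, a rough sketch using Keras 2's multi_gpu_model utility is shown below; this is my own addition rather than part of the original code, and gpus=4 simply mirrors the four-card setup mentioned above.

# Sketch only (assumption: Keras 2.x with multi_gpu_model available).
# multi_gpu_model replicates the model on each GPU, splits every batch across
# the cards, and concatenates the outputs on the CPU.
from keras.utils import multi_gpu_model
from keras.optimizers import Adam

parallel_model = multi_gpu_model(model, gpus=4)  # 4 = number of 2080 Ti cards
parallel_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),
    metrics=['accuracy']
)
parallel_model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=10,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D),
    callbacks=callbacks
)

One caveat with this approach: ModelCheckpoint would then save the parallel wrapper, so it is common practice to keep a reference to the single-GPU template model and save its weights instead.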
