Machine Learning | Text Classification (Fine-Tuning BERT for a Downstream Task)

1. Clone BERT and obtain a pretrained model

$ git clone https://github.com/google-research/bert.git
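The fine-tuning script also needs a pretrained checkpoint. Below is a minimal sketch for fetching and unpacking one; the URL is the Chinese BERT-Base link listed in the google-research/bert README (swap in whichever model matches your language and task, and note the target folder name is just an example):

import urllib.request
import zipfile

# Chinese BERT-Base, as listed in the google-research/bert README.
# Substitute the checkpoint that matches your data.
URL = "https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip"

urllib.request.urlretrieve(URL, "bert_model.zip")
with zipfile.ZipFile("bert_model.zip") as zf:
    zf.extractall("models")  # unpacks vocab.txt, bert_config.json, bert_model.ckpt.*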

Dependencies and environment:
  • tensorflow-gpu 1.15 (TF2 is not recommended)
  • Python 3.7
  • CUDA 10.2
  • Pretrained models: https://github.com/google-research/bert
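A quick sanity check that the installed versions match the list above (a sketch; assumes tensorflow-gpu is already installed):

import sys
import tensorflow as tf

print(sys.version)                 # expect 3.7.x
print(tf.__version__)              # expect 1.15.x
print(tf.test.is_gpu_available())  # True if TF can see the GPU/CUDA setup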
2. Write your own data processor in run_classifier.py
class MyProcessor(DataProcessor):
  """Processor for my data set."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      # Only the test set has a header
      if set_type == "test" and i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      if set_type == "test":
        text_a = tokenization.convert_to_unicode(___)
        text_b = tokenization.convert_to_unicode(___)
        label = "0"
      else:
        text_a = tokenization.convert_to_unicode(___)
        text_b = tokenization.convert_to_unicode(___)
        label = tokenization.convert_to_unicode(___)
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples


def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "cola": ColaProcessor,
      "mnli": MnliProcessor,
      "mrpc": MrpcProcessor,
      "xnli": XnliProcessor,
      "mypc": MyProcessor,  # register the new processor under task_name "mypc"
  }
  # ... rest of main() is unchanged
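The ___ blanks depend entirely on your TSV column order, so they are left unfilled above. As a hypothetical sketch, suppose a single-sentence task where train.tsv/dev.tsv rows are "<label> TAB <text>" with no header, and test.tsv rows are "<id> TAB <text>" with a header row; the body of _create_examples would then become:

      # Hypothetical TSV layout; adjust the line[...] indices to your own files.
      if set_type == "test":
        text_a = tokenization.convert_to_unicode(line[1])
        text_b = None      # single-sentence task: no second segment
        label = "0"        # dummy label; ignored at prediction time
      else:
        text_a = tokenization.convert_to_unicode(line[1])
        text_b = None
        label = tokenization.convert_to_unicode(line[0])

For a sentence-pair task you would instead point text_b at the column holding the second sentence.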

3. Pass the required flags and fine-tune the model
#!/bin/bash
export CURRENT_PATH=$(cd "$(dirname "$0")"; pwd)
export BERT_BASE_DIR=$CURRENT_PATH/models/YOUR_MODEL_PATH
export MY_DATASET=$CURRENT_PATH/YOUR_DATA_PATH/

# Keep --do_lower_case=False when using the multilingual (cased) model.
# (The comment must sit here: a "#" inside a backslash-continued command breaks it.)
python run_classifier.py \
  --task_name=mypc \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --data_dir=$MY_DATASET \
  --vocab_file=$BERT_BASE_DIR/vocab.txt \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --max_seq_length=64 \
  --train_batch_size=16 \
  --learning_rate=5e-5 \
  --num_train_epochs=3.0 \
  --do_lower_case=False \
  --output_dir=$CURRENT_PATH/YOUR_OUTPUT_PATH
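With --do_predict=true, the stock run_classifier.py writes test_results.tsv into the output directory: one row per test example, one tab-separated probability per label. A minimal sketch for turning those rows into label predictions, assuming the two-label MyProcessor above (the path is a placeholder):

import numpy as np

LABELS = ["0", "1"]  # must match the order returned by MyProcessor.get_labels()

# ndmin=2 keeps the shape (num_examples, num_labels) even for a single row
probs = np.loadtxt("YOUR_OUTPUT_PATH/test_results.tsv", ndmin=2)
preds = [LABELS[i] for i in probs.argmax(axis=1)]
print(preds[:10])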
