隐马尔科夫实现中文分词（Python3实现） NLP

前言：隐马尔科夫的相关概念就不多说了，就是一个三元组（A,B,Pi），分别表示转移概率，发射概率和初始状态概率。
首先是语料库的训练部分：

#!/usr/bin/python #-*-coding:utf-8 import sys import math import pdb state_M = 4 word_N = 0A_dic = {} B_dic = {} Count_dic = {} Pi_dic = {} word_set = set() state_list = ['B','M','E','S'] line_num = -1# 语料库这里用的是人民日报已经人工分词的语料库 INPUT_DATA = "https://www.it610.com/article/RenMinData.txt" # 初始状态概率 PROB_START = "prob_start.py" # 发射状态概率 PROB_EMIT = "prob_emit.py" # 转移状态概率 PROB_TRANS = "prob_trans.py"def init(): global state_M global word_N for state in state_list: A_dic[state] = {} for state1 in state_list: A_dic[state][state1] = 0.0 for state in state_list: Pi_dic[state] = 0.0 B_dic[state] = {} Count_dic[state] = 0# 输入词语输出状态 B代表其实字，M代表中间字，E代表结束字，S代表单字成词 def getList(input_str): outpout_str = [] if len(input_str) == 1: outpout_str.append('S') elif len(input_str) == 2: outpout_str = ['B','E'] else: M_num = len(input_str) -2 M_list = ['M'] * M_num outpout_str.append('B') outpout_str.extend(M_list) outpout_str.append('S') return outpout_str# 输出模型的三个参数：初始概率+转移概率+发射概率 def Output(): start_fp = open(PROB_START,'w') emit_fp = open(PROB_EMIT,'w') trans_fp = open(PROB_TRANS,'w') print("11111111") print ("len(word_set) = %s " % (len(word_set)))for key in Pi_dic: Pi_dic[key] = Pi_dic[key] * 1.0 / line_num # start_fp.write(str(start_fp)+str(Pi_dic)) print(start_fp,Pi_dic)for key in A_dic: for key1 in A_dic[key]: A_dic[key][key1] = A_dic[key][key1] / Count_dic[key] # trans_fp.write(str(trans_fp)+str(A_dic)) print(trans_fp,A_dic)for key in B_dic: for word in B_dic[key]: B_dic[key][word] = B_dic[key][word] / Count_dic[key] emit_fp.write(str(emit_fp)+str(B_dic)) print(emit_fp,B_dic)start_fp.close() emit_fp.close() trans_fp.close()def main(): ifp = open(INPUT_DATA,encoding='utf-8') init() global word_set global line_num for line in ifp: line_num += 1 if line_num % 10000 == 0: print (line_num)line = line.strip() if not line: continue # line = line.decode("utf-8","ignore")word_list = [] for i in range(len(line)): if line[i] == " ":continue word_list.append(line[i]) word_set = word_set | set(word_list)lineArr = line.split(" ") line_state = [] for item in lineArr: # 一句话对应一行连续的状态 line_state.extend(getList(item))if len(word_list) != len(line_state): print (sys.stderr,"[line_num = %d][line = %s]" % (line_num, line.encode("utf-8",'ignore'))) else: for i in range(len(line_state)): if i == 0: # 计算转移概率 Pi_dic[line_state[0]] += 1 Count_dic[line_state[0]] += 1 else: # 计算发射概率 A_dic[line_state[i-1]][line_state[i]] += 1 Count_dic[line_state[i]] += 1 # if not B_dic[line_state[i]].has_key(word_list[i]): if word_list[i] not in B_dic[line_state[i]]: B_dic[line_state[i]][word_list[i]] = 0.0 else: B_dic[line_state[i]][word_list[i]] += 1 Output() ifp.close()if __name__ == "__main__": main()

然后在训练的基础上，进行测试，用的是维特比方法，该方法是用于降低计算复杂度而产生的，使用了动态规划的思想，笔记中有描述。

#!/usr/bin/python #-*-coding:utf-8 import os import sys import pdbdef load_model(f_name): ifp = open(f_name, 'rb') return eval(ifp.read())prob_start = load_model("prob_start.py") prob_trans = load_model("prob_trans.py") prob_emit = load_model("prob_emit.py")def viterbi(obs, states, start_p, trans_p, emit_p): V = [{}] #tabular path = {} for y in states: #init V[0][y] = start_p[y] * emit_p[y].get(obs[0],0) path[y] = [y] for t in range(1,len(obs)): V.append({}) newpath = {} for y in states: (prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states if V[t-1][y0]>0]) V[t][y] =prob newpath[y] = path[state] + [y] path = newpath (prob, state) = max([(V[len(obs) - 1][y], y) for y in states]) return (prob, path[state])def cut(sentence): #pdb.set_trace() prob, pos_list =viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit) return (prob,pos_list)if __name__ == "__main__": test_str = u"长春市长春节讲话。" prob,pos_list = cut(test_str) print(test_str) print(pos_list) test_str = u"他说的确实在理." prob,pos_list = cut(test_str) print (test_str) print (pos_list)test_str = u"毛主席万岁。" prob,pos_list = cut(test_str) print (test_str) print (pos_list)test_str = u"我有一台电脑。" prob,pos_list = cut(test_str) print (test_str) print (pos_list)test_str = u"我不喜欢你。" prob, pos_list = cut(test_str) print(test_str) print(pos_list)

结果为： 【隐马尔科夫实现中文分词（Python3实现）】