# NLP 中文分词-双向匹配算法(理论+Python实现)

# 【NLP 中文分词-双向匹配算法(理论+Python实现)】
import time
import re
class Segment:
    """Bidirectional maximum-matching word segmenter for a single sentence.

    The sentence is segmented twice against a word dictionary: once scanning
    left to right (``MM``) and once scanning right to left (``RMM``).  Both
    results are stored as strings in which every token is followed by
    ``"/ "`` and tokens appear in sentence order, e.g. ``"中国/ 的/ "``.
    ``printFinalResult`` / ``getFinalResult`` then pick one of the two.

    Attributes:
        sentence: Text to segment.
        MaxLen: Largest window (in characters) tried when matching a word.
        dict: Set of known words (one set per instance).
        result_MM: Forward maximum-matching result.
        result_RMM: Backward maximum-matching result.
        final_res: Segmentation selected from the two results.
    """

    # Immutable class-level defaults, kept for backward compatibility.  The
    # word dictionary is intentionally NOT declared here: a mutable class
    # attribute would be shared and grown by every instance.
    sentence = ""
    MaxLen = 0
    pos = 0
    len = 0
    result_MM = ""
    result_RMM = ""
    final_res = ""

    def __init__(self, sentence, MaxLen, dict_path="chineseDic.txt"):
        """Store the sentence and load the dictionary.

        Args:
            sentence: Text to segment.
            MaxLen: Maximum word length tried while matching.
            dict_path: Dictionary file; the first comma-separated field of
                each line is taken as a word.
        """
        self.sentence = sentence
        self.MaxLen = MaxLen
        self.pos = 0
        self.len = self.MaxLen
        self.result_MM = ""
        self.result_RMM = ""
        self.final_res = ""
        self.dict = set()
        self.readDict(dict_path)

    def readDict(self, path="chineseDic.txt"):
        """Add the first comma-separated field of every line of *path* to ``self.dict``."""
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # strip() removes the newline left on lines without a comma
                word = line.split(",")[0].strip()
                if word:
                    self.dict.add(word)

    def _match_forward(self, start, window):
        """Return the longest dictionary word starting at *start* (a single char if none)."""
        remaining = len(self.sentence) - start
        for size in range(max(1, min(window, remaining)), 1, -1):
            candidate = self.sentence[start:start + size]
            if candidate in self.dict:
                return candidate
        return self.sentence[start:start + 1]

    def _match_backward(self, end, window):
        """Return the longest dictionary word ending just before *end* (a single char if none)."""
        # The window is clamped to `end` so the slice start never goes
        # negative (a negative start would wrap around to the sentence tail).
        for size in range(max(1, min(window, end)), 1, -1):
            candidate = self.sentence[end - size:end]
            if candidate in self.dict:
                return candidate
        return self.sentence[end - 1:end]

    def MM(self, nLen, nPos):
        """Forward maximum matching from index *nPos* to the end of the sentence.

        Args:
            nLen: Window size for the first match; later matches use ``MaxLen``.
            nPos: Start index (0 for the whole sentence).

        The tokens are appended to ``result_MM``.  The scan is iterative, so
        sentence length is not limited by the interpreter's recursion limit.
        """
        length = len(self.sentence)
        nPos = max(nPos, 0)
        pieces = []
        while nPos < length:
            word = self._match_forward(nPos, nLen)
            pieces.append(word)
            nPos += len(word)
            nLen = self.MaxLen
        self.result_MM += "".join(word + "/ " for word in pieces)

    def RMM(self, nLen, nPos):
        """Backward maximum matching from index *nPos* down to the sentence start.

        Args:
            nLen: Window size for the first match; later matches use ``MaxLen``.
            nPos: Exclusive end index (``len(sentence)`` for the whole sentence).

        Tokens are found last-to-first but stored in ``result_RMM`` in
        sentence order, using the same ``"word/ "`` format as ``result_MM``.
        """
        nPos = min(nPos, len(self.sentence))
        pieces = []
        while nPos > 0:
            word = self._match_backward(nPos, nLen)
            pieces.append(word)
            nPos -= len(word)
            nLen = self.MaxLen
        pieces.reverse()
        self.result_RMM = "".join(word + "/ " for word in pieces) + self.result_RMM

    @staticmethod
    def _tokens(result):
        """Split a ``"word/ word/ "`` result string into its list of tokens."""
        parts = result.split("/ ")
        if parts and parts[-1] == "":
            parts.pop()  # the terminating "/ " leaves one empty trailing piece
        return parts

    def _choose_final(self):
        """Select ``final_res`` from the two results and return both token lists.

        Identical results yield the forward one.  Otherwise the result with
        fewer tokens wins, then the one with fewer single-character tokens,
        and on a full tie the backward result is used.
        """
        mm = self._tokens(self.result_MM)
        rmm = self._tokens(self.result_RMM)
        if mm == rmm or not rmm:
            best = self.result_MM
        elif not mm:
            best = self.result_RMM
        elif len(mm) != len(rmm):
            best = self.result_MM if len(mm) < len(rmm) else self.result_RMM
        else:
            singles_mm = sum(1 for word in mm if len(word) == 1)
            singles_rmm = sum(1 for word in rmm if len(word) == 1)
            best = self.result_MM if singles_mm < singles_rmm else self.result_RMM
        self.final_res = best
        return mm, rmm

    def getMMResult(self):
        """Return the forward maximum-matching result string."""
        return self.result_MM

    def getRMMResult(self):
        """Return the backward maximum-matching result string."""
        return self.result_RMM

    def getFinalResult(self):
        """Return the selected segmentation, choosing it first if not yet set."""
        if not self.final_res:
            self._choose_final()
        return self.final_res

    def printFinalResult(self):
        """Print both segmentations, whether they agree, and the selected result."""
        seg_list_MM, seg_list_RMM = self._choose_final()

        print("正向最大匹配结果:")
        print(self.result_MM.replace(" ", ""))
        print(seg_list_MM)

        print("逆向最大匹配结果:")
        print(self.result_RMM.replace(" ", ""))
        print(seg_list_RMM)

        # Compare the full token lists, not just their common prefix.
        if seg_list_MM == seg_list_RMM:
            print("两次分词结果一致。")
        else:
            print("两次分词结果不一致。")
        print("最终的分词结果为:")
        print(self.final_res)

def to_region(segmentation):
    """Convert a "/"-delimited segmentation into 1-based inclusive intervals.

    Every word is mapped to ``(start, end)``, its first and last character
    position in the unsegmented text, counting from 1.

    Args:
        segmentation: Words each terminated by ``"/"``, with or without
            whitespace after the slash (``"中国/ 的/ "`` and ``"中国/的/"``
            give the same regions).  Bare whitespace also separates words.

    Returns:
        List of ``(start, end)`` tuples in text order; empty for empty input.

    Note:
        A literal ``"/"`` inside the text cannot be told apart from a
        delimiter and is therefore not counted.
    """
    region = []
    start = 1
    for word in re.split(r"/\s*|\s+", segmentation.strip()):
        if not word:
            # produced by the trailing delimiter or by empty tokens
            continue
        end = start + len(word) - 1
        region.append((start, end))
        start = end + 1
    return region

def PRF(target, pred):
    """Print and return precision, recall and F1 of *pred* against *target*.

    Both arguments are iterables of hashable items (here: word intervals);
    duplicates are ignored because the comparison is done on sets.

    Args:
        target: Gold-standard items.
        pred: Predicted items.

    Returns:
        Tuple ``(p, r, f)``.  A ratio whose denominator would be zero (empty
        prediction, empty target, or no overlap at all) is reported as 0.0
        instead of raising ``ZeroDivisionError``.
    """
    t_set, p_set = set(target), set(pred)
    cap_num = len(t_set & p_set)
    p = cap_num / len(p_set) if p_set else 0.0
    r = cap_num / len(t_set) if t_set else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    print("P =", p)
    print("R =", r)
    print("F1 =", f)
    return p, r, f

if __name__ == '__main__':
    # Demo: segment a sample sentence with both passes, time the two passes,
    # then score the selected segmentation against a hand-made reference.
    # The sentence must contain exactly the characters of target_str below
    # (an advertising link that had been pasted into it was removed).
    test_str = '在这一年中,中国的改革开放和现代化建设继续向前迈进。国民经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。'
    seg = Segment(test_str, 3)

    time_start = time.time()
    seg.MM(3, 0)
    seg.RMM(3, len(test_str))
    time_end = time.time()

    seg.printFinalResult()
    print('分词时间:', time_end - time_start, 's')

    target_str = "在/这/一/年/中/,/中国/的/改革/开放/和/现代化/建设/继续/向前/迈进/。/国民经济/保持/了/“/高/增长/、/低/通胀/”/的/良好/发展/态势/。/农业/生产/再次/获得/好/的/收成/,/企业/改革/继续/深化/,/人民/生活/进一步/改善/。/对外/经济/技术/合作/与/交流/不断/扩大/。/"

    # Each word is identified by its start/end position in the text, so the
    # two segmentations can be compared as sets of intervals [i, j].
    re_pred = to_region(seg.getFinalResult())
    re_target = to_region(target_str)
    print("分词结果:", re_pred)
    print("标准答案:", re_target)
    PRF(re_target, re_pred)

# 推荐阅读