123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414 |
- # encoding:utf-8
- '''
- Created on 2016年9月7日
- @author: liuyu
- Last updated: 2018年2月25日
- @editor: yaleimeng
- @主要改动:
- 1,修复了读取词库遗漏的bug,不重复词从53251增加到53336个
- 2,对于不够Pythonic的代码按python风格改写,
- 3,按Python风格改写后,省去了几个多余函数。可读性更好
- '''
- import math
- import os
- def parseZhAndEn(text):
- words = text.split('|')
- if len(words) == 2:
- return words[1], words[0]
- else:
- return text, text
- class GlossaryElement:
- ''' #词汇表条目 '''
- def __init__(self):
- self.word = '' # 词
- self.type = '' # 词性
- self.solid = False # 实词/虚词
- self.s_first = '' # 第一基本义原
- self.s_other = [] # 其他义原
- self.s_relation = {} # 关系义原
- self.s_symbol = {} # 符号义原
- def dump(self):
- print(self.word + ',' + self.type + ', | first:' + self.s_first + ' | other:')
- for i in range(len(self.s_other)):
- print(self.s_other[i] + ',')
- print(' | relation:')
- for it in self.s_relation.keys():
- print(it + '=' + self.s_relation[it] + ',')
- print(' | symbol:')
- for it in self.s_symbol.keys():
- print(it + '=' + self.s_symbol[it] + ',')
- print('\n')
- def parse(self, text):
- line = text
- if not line.strip(): # 如果本行为空,则返回False。不为空,则进行解析
- return False
- items = line.split('/')
- if len(items) == 3:
- self.word = items[0]
- self.type = items[1]
- if line[0] != '{':
- self.solid = True
- else:
- self.solid = False
- line = line[1:len(line) - 2]
- sememes = items[2].split(',')
- if len(sememes) > 0:
- firstdone = False
- if sememes[0][0].isalpha():
- self.s_first, defaultText = parseZhAndEn(sememes[0])
- firstdone = True
- for i in range(len(sememes)):
- if 0 == i and firstdone:
- continue
- firstletter = sememes[i][0]
- if '(' == firstletter:
- self.s_other.append(sememes[i])
- continue
- equalpos = sememes[i].find('=')
- if equalpos != -1:
- key = sememes[i][0:equalpos]
- value = sememes[i][equalpos + 1]
- if len(value) > 0 and value[0] != '(':
- value, defaultText = parseZhAndEn(value)
- self.s_relation[key] = value
- continue
- if firstletter.isalpha() == False:
- value = sememes[i][1:]
- if len(value) > 0 and value[0] != '(':
- value, defaultText = parseZhAndEn(value)
- self.s_symbol[firstletter] = value
- continue
- self.s_other.append(sememes[i])
- # self.dump()
- return True
- return False
- class SememeElement:
- ''' 义原条目 '''
- def __init__(self):
- self.id = -1 # 编号
- self.father = -1 # 英文义原
- self.sememe_zh = '' # 中文义原
- self.sememe_en = '' # 父义原编号
- def parse(self, line):
- if not line: # 如果当前行为空,不解析,返回False
- return False
- items = line.split()
- if len(items) == 3:
- self.id = items[0]
- self.father = items[2]
- self.sememe_zh, self.sememe_en = parseZhAndEn(items[1])
- return True
- return False
- def valuesOfGlossarytable_(glossarytable_, word):
- values_ = []
- for key_, v_ in glossarytable_.items():
- key_ = key_.split('\t')[1]
- if key_ == word:
- values_.append(v_)
- return values_
- class How_Similarity:
- def __init__(self):
- self.sememetable_ = dict() # 义原表
- self.sememeindex_zn_ = dict() # 义原索引(中文)
- self.glossarytable_ = dict() # 词汇表。
- self.glossaryfile = os.path.join(os.path.dirname(__file__), 'glossary.txt')
- self.sememefile = os.path.join(os.path.dirname(__file__), 'whole.dat')
- self.vocab = set()
- self.BETA = [0.5, 0.2, 0.17, 0.13]
- self.GAMA = 0.2
- self.DELTA = 0.2
- self.ALFA = 1.6
- self.init()
- def init(self):
- ''' 初始化义原和词汇表 '''
- if self.loadSememeTable(self.sememefile) == False:
- print("[ERROR] %s 加载失败.", self.sememefile)
- return False
- if self.loadGlossary(self.glossaryfile) == False:
- print("[ERROR] %s 加载失败.", self.glossaryfile)
- return False
- return True
- def loadSememeTable(self, filename):
- with open(filename, 'rt', encoding='utf-8') as reader:
- try:
- lines = reader.readlines()
- for line in lines:
- if line.strip(): # 如果当前行不为空
- ele = SememeElement()
- if ele.parse(line):
- self.sememetable_[ele.id] = ele
- self.sememeindex_zn_[ele.sememe_zh] = ele
- except Exception as e:
- print('function loadSememeTable has Errors!!')
- print(e)
- return False
- return True
- def loadGlossary(self, filename):
- ''' 加载词汇表 '''
- with open(filename, 'r', encoding='utf-8') as reader:
- try:
- lines = reader.readlines()
- if not lines: # 从 lines = [] 改为 not,更pythonic
- return False
- count = 0
- for line in lines:
- if not line.strip(): # empty函数 == False 改得更pythonic。
- continue # 使用continue,减小嵌套深度
- ele = GlossaryElement()
- if ele.parse(line):
- self.glossarytable_[str(count) + '\t' + ele.word] = ele
- self.vocab.add(ele.word)
- count += 1
- # print('function loadGlossary has been completed!!')
- except Exception as e:
- print('function loadGlossary has errors!!', e)
- return False
- return True
- def getSememeByID(self, id_):
- ''' 根据编号获取义原 '''
- if id_ in self.sememetable_.keys():
- return self.sememetable_[id_]
- return None
- def getSememeByZh(self, word):
- ''' 根据汉词获取义原 '''
- if word in self.sememeindex_zn_.keys():
- return self.sememeindex_zn_[word]
- return None
- def getGlossary(self, word):
- ''' 获取词汇表中的词 '''
- if word in self.vocab:
- return valuesOfGlossarytable_(self.glossarytable_, word)
- return None
- def calcGlossarySim(self, w1, w2):
- ''' 计算词汇表中两个词的相似度 '''
- if w1 == None or w2 == None: return 0.0
- if w1.solid != w2.solid: return 0.0
- sim1 = self.calcSememeSimFirst(w1, w2)
- sim2 = self.calcSememeSimOther(w1, w2)
- sim3 = self.calcSememeSimRelation(w1, w2)
- sim4 = self.calcSememeSimSymbol(w1, w2)
- sim = self.BETA[0] * sim1 + self.BETA[1] * sim1 * sim2 + self.BETA[2] * sim1 * sim2 * sim3 + (
- self.BETA[3] * sim1 * sim2 * sim3 * sim4)
- return sim
- def calcSememeSim(self, w1, w2, ):
- ''' 计算两个义原之间的相似度 '''
- if not w1 and not w2:
- return 1.0
- if not w1 or not w2:
- return self.DELTA
- if w1 == w2:
- return 1.0
- d = self.calcSememeDistance(w1, w2) # 这里是刘群的相似度计算公式。
- if d >= 0:
- return self.ALFA / (self.ALFA + d)
- else:
- return -1.0
- def weight(self, i):
- left = 1 - i / 13
- PI = 3.1415926536
- right = 1 + math.sin(i * PI / 45)
- return left * right
- def calcSememeDistance(self, w1, w2): # 论文修改了距离函数。
- '''
- 计算义原之间的距离(义原树中两个节点之间的距离)
- '''
- s1 = self.getSememeByZh(w1)
- s2 = self.getSememeByZh(w2)
- if s1 == None or s2 == None:
- return -1.0
- fatherpath = []
- id1, id2 = s1.id, s2.id
- father1, father2 = s1.father, s2.father
- while (id1 != father1): # 追溯 s1的上层词。
- fatherpath.append(id1)
- id1 = father1
- father_ = self.getSememeByID(father1)
- if father_:
- father1 = father_.father
- fatherpath.append(id1)
- len_ = 0.0
- while (id2 != father2):
- if id2 in fatherpath:
- father_pos = fatherpath.index(id2)
- return self.weight(father_pos) + len_
- id2 = father2
- father_ = self.getSememeByID(father2)
- if father_:
- father2 = father_.father
- len_ = len_ + self.weight(1)
- if id2 in fatherpath:
- father_pos = fatherpath.index(id2)
- return self.weight(father_pos) + len_
- return 20.0
- def calcSememeSimFirst(self, w1, w2):
- ''' 计算第一基本义原之间的相似度 '''
- return self.calcSememeSim(w1.s_first, w2.s_first)
- def calcSememeSimOther(self, w1, w2):
- ''' 计算其他义原之间的相似度 '''
- if w1.s_other == [] and w2.s_other == []:
- return 1.0
- sum_ = 0.0
- for i in range(len(w1.s_other)):
- maxTemp = -1.0
- for j in range(len(w2.s_other)):
- temp = 0.0
- if w1.s_other[i][0] != '(' and w2.s_other[j][0] != '(':
- temp = self.calcSememeSim(w1.s_other[i], w2.s_other[j])
- elif w1.s_other[i][0] == '(' and w2.s_other[j][0] == '(':
- if w1.s_other[i] == w2.s_other[j]:
- temp = 1.0
- else:
- maxTemp = 0.0
- else:
- temp = self.GAMA
- if temp > maxTemp:
- maxTemp = temp
- if maxTemp == -1.0: # there is no element in w2.s_other
- maxTemp = self.DELTA
- sum_ = sum_ + maxTemp
- if len(w1.s_other) < len(w2.s_other):
- sum_ = sum_ + (len(w2.s_other) - len(w1.s_other)) * self.DELTA
- return sum_ / max(len(w1.s_other), len(w2.s_other))
- def calcSememeSimRelation(self, w1, w2):
- ''' 计算关系义原之间的相似度 '''
- if w1.s_relation == {} and w2.s_relation == {}:
- return 1.0
- sum_ = 0.0
- for it1 in w1.s_relation.keys():
- maxTemp = 0.0
- temp = 0.0
- if it1 in w2.s_relation.keys():
- if w1.s_relation[it1][0] != '(' and w2.s_relation[it1][0] != '(':
- temp = self.calcSememeSim(w1.s_relation[it1], w2.s_relation[it1])
- elif w1.s_relation[it1][0] == '(' and w2.s_relation[it1][0] == '(':
- if w1.s_relation[it1] == w2.s_relation[it1]:
- temp = 1.0
- else:
- maxTemp = 0.0
- else:
- temp = self.GAMA
- else:
- maxTemp = self.DELTA
- if temp > maxTemp:
- maxTemp = temp
- sum_ = sum_ + maxTemp
- if len(w1.s_relation) < len(w2.s_relation):
- sum_ = sum_ + (len(w2.s_relation) - len(w1.s_relation)) * self.DELTA
- return sum_ / max(len(w1.s_relation), len(w2.s_relation))
- def calcSememeSimSymbol(self, w1, w2):
- ''' 计算符号义原之间的相似度 '''
- if w1.s_symbol == {} and w2.s_symbol == {}:
- return 1.0
- sum_ = 0.0
- for it1 in w1.s_symbol.keys():
- maxTemp = 0.0
- temp = 0.0
- if it1 in w2.s_symbol.keys():
- if w1.s_symbol[it1][0] != '(' and w2.s_symbol[it1][0] != '(':
- temp = self.calcSememeSim(w1.s_symbol[it1], w2.s_symbol[it1])
- elif w1.s_symbol[it1][0] == '(' and w2.s_symbol[it1][0] == '(':
- if w1.s_symbol[it1] == w2.s_symbol[it1]:
- temp = 1.0
- else:
- maxTemp = 0.0
- else:
- temp = self.GAMA
- else:
- maxTemp = self.DELTA
- if temp > maxTemp:
- maxTemp = temp
- sum_ = sum_ + maxTemp
- if len(w1.s_symbol) < len(w2.s_symbol):
- sum_ = sum_ + (len(w2.s_symbol) - len(w1.s_symbol)) * self.DELTA
- return sum_ / max(len(w1.s_symbol), len(w2.s_symbol))
- def calc(self, w1, w2, ):
- '''
- 计算两个词的语义相似度(返回值: [0, 1], -2:指定的词词典中不存在)
- '''
- if w1 == w2:
- return 1
- sw1 = self.getGlossary(w1) # 获取词表。
- sw2 = self.getGlossary(w2)
- if sw1 == None or sw2 == None or len(sw1) <= 0 or len(sw2) <= 0:
- return -2
- max__ = 0
- tmp = 0
- for i in range(len(sw1)):
- for j in range(len(sw2)):
- tmp = self.calcGlossarySim(sw1[i], sw2[j])
- max__ = max(max__, tmp)
- return max__
|