# howNet.py
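# encoding:utf-8
'''
Created on 2016-09-07
@author: liuyu
Last updated: 2018-02-25
@editor: yaleimeng
Main changes:
1. Fixed a bug that skipped entries while reading the lexicon; the number of
   distinct words rose from 53251 to 53336.
2. Rewrote code that was not Pythonic enough in idiomatic Python style.
3. The Pythonic rewrite made several redundant functions unnecessary and
   improved readability.
'''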
import math
import os
  14. def parseZhAndEn(text):
  15. words = text.split('|')
  16. if len(words) == 2:
  17. return words[1], words[0]
  18. else:
  19. return text, text
  20. class GlossaryElement:
  21. ''' #词汇表条目 '''
  22. def __init__(self):
  23. self.word = '' # 词
  24. self.type = '' # 词性
  25. self.solid = False # 实词/虚词
  26. self.s_first = '' # 第一基本义原
  27. self.s_other = [] # 其他义原
  28. self.s_relation = {} # 关系义原
  29. self.s_symbol = {} # 符号义原
  30. def dump(self):
  31. print(self.word + ',' + self.type + ', | first:' + self.s_first + ' | other:')
  32. for i in range(len(self.s_other)):
  33. print(self.s_other[i] + ',')
  34. print(' | relation:')
  35. for it in self.s_relation.keys():
  36. print(it + '=' + self.s_relation[it] + ',')
  37. print(' | symbol:')
  38. for it in self.s_symbol.keys():
  39. print(it + '=' + self.s_symbol[it] + ',')
  40. print('\n')
  41. def parse(self, text):
  42. line = text
  43. if not line.strip(): # 如果本行为空,则返回False。不为空,则进行解析
  44. return False
  45. items = line.split('/')
  46. if len(items) == 3:
  47. self.word = items[0]
  48. self.type = items[1]
  49. if line[0] != '{':
  50. self.solid = True
  51. else:
  52. self.solid = False
  53. line = line[1:len(line) - 2]
  54. sememes = items[2].split(',')
  55. if len(sememes) > 0:
  56. firstdone = False
  57. if sememes[0][0].isalpha():
  58. self.s_first, defaultText = parseZhAndEn(sememes[0])
  59. firstdone = True
  60. for i in range(len(sememes)):
  61. if 0 == i and firstdone:
  62. continue
  63. firstletter = sememes[i][0]
  64. if '(' == firstletter:
  65. self.s_other.append(sememes[i])
  66. continue
  67. equalpos = sememes[i].find('=')
  68. if equalpos != -1:
  69. key = sememes[i][0:equalpos]
  70. value = sememes[i][equalpos + 1]
  71. if len(value) > 0 and value[0] != '(':
  72. value, defaultText = parseZhAndEn(value)
  73. self.s_relation[key] = value
  74. continue
  75. if firstletter.isalpha() == False:
  76. value = sememes[i][1:]
  77. if len(value) > 0 and value[0] != '(':
  78. value, defaultText = parseZhAndEn(value)
  79. self.s_symbol[firstletter] = value
  80. continue
  81. self.s_other.append(sememes[i])
  82. # self.dump()
  83. return True
  84. return False
  85. class SememeElement:
  86. ''' 义原条目 '''
  87. def __init__(self):
  88. self.id = -1 # 编号
  89. self.father = -1 # 英文义原
  90. self.sememe_zh = '' # 中文义原
  91. self.sememe_en = '' # 父义原编号
  92. def parse(self, line):
  93. if not line: # 如果当前行为空,不解析,返回False
  94. return False
  95. items = line.split()
  96. if len(items) == 3:
  97. self.id = items[0]
  98. self.father = items[2]
  99. self.sememe_zh, self.sememe_en = parseZhAndEn(items[1])
  100. return True
  101. return False
  102. def valuesOfGlossarytable_(glossarytable_, word):
  103. values_ = []
  104. for key_, v_ in glossarytable_.items():
  105. key_ = key_.split('\t')[1]
  106. if key_ == word:
  107. values_.append(v_)
  108. return values_
  109. class How_Similarity:
  110. def __init__(self):
  111. self.sememetable_ = dict() # 义原表
  112. self.sememeindex_zn_ = dict() # 义原索引(中文)
  113. self.glossarytable_ = dict() # 词汇表。
  114. self.glossaryfile = os.path.join(os.path.dirname(__file__), 'glossary.txt')
  115. self.sememefile = os.path.join(os.path.dirname(__file__), 'whole.dat')
  116. self.vocab = set()
  117. self.BETA = [0.5, 0.2, 0.17, 0.13]
  118. self.GAMA = 0.2
  119. self.DELTA = 0.2
  120. self.ALFA = 1.6
  121. self.init()
  122. def init(self):
  123. ''' 初始化义原和词汇表 '''
  124. if self.loadSememeTable(self.sememefile) == False:
  125. print("[ERROR] %s 加载失败.", self.sememefile)
  126. return False
  127. if self.loadGlossary(self.glossaryfile) == False:
  128. print("[ERROR] %s 加载失败.", self.glossaryfile)
  129. return False
  130. return True
  131. def loadSememeTable(self, filename):
  132. with open(filename, 'rt', encoding='utf-8') as reader:
  133. try:
  134. lines = reader.readlines()
  135. for line in lines:
  136. if line.strip(): # 如果当前行不为空
  137. ele = SememeElement()
  138. if ele.parse(line):
  139. self.sememetable_[ele.id] = ele
  140. self.sememeindex_zn_[ele.sememe_zh] = ele
  141. except Exception as e:
  142. print('function loadSememeTable has Errors!!')
  143. print(e)
  144. return False
  145. return True
  146. def loadGlossary(self, filename):
  147. ''' 加载词汇表 '''
  148. with open(filename, 'r', encoding='utf-8') as reader:
  149. try:
  150. lines = reader.readlines()
  151. if not lines: # 从 lines = [] 改为 not,更pythonic
  152. return False
  153. count = 0
  154. for line in lines:
  155. if not line.strip(): # empty函数 == False 改得更pythonic。
  156. continue # 使用continue,减小嵌套深度
  157. ele = GlossaryElement()
  158. if ele.parse(line):
  159. self.glossarytable_[str(count) + '\t' + ele.word] = ele
  160. self.vocab.add(ele.word)
  161. count += 1
  162. # print('function loadGlossary has been completed!!')
  163. except Exception as e:
  164. print('function loadGlossary has errors!!', e)
  165. return False
  166. return True
  167. def getSememeByID(self, id_):
  168. ''' 根据编号获取义原 '''
  169. if id_ in self.sememetable_.keys():
  170. return self.sememetable_[id_]
  171. return None
  172. def getSememeByZh(self, word):
  173. ''' 根据汉词获取义原 '''
  174. if word in self.sememeindex_zn_.keys():
  175. return self.sememeindex_zn_[word]
  176. return None
  177. def getGlossary(self, word):
  178. ''' 获取词汇表中的词 '''
  179. if word in self.vocab:
  180. return valuesOfGlossarytable_(self.glossarytable_, word)
  181. return None
  182. def calcGlossarySim(self, w1, w2):
  183. ''' 计算词汇表中两个词的相似度 '''
  184. if w1 == None or w2 == None: return 0.0
  185. if w1.solid != w2.solid: return 0.0
  186. sim1 = self.calcSememeSimFirst(w1, w2)
  187. sim2 = self.calcSememeSimOther(w1, w2)
  188. sim3 = self.calcSememeSimRelation(w1, w2)
  189. sim4 = self.calcSememeSimSymbol(w1, w2)
  190. sim = self.BETA[0] * sim1 + self.BETA[1] * sim1 * sim2 + self.BETA[2] * sim1 * sim2 * sim3 + (
  191. self.BETA[3] * sim1 * sim2 * sim3 * sim4)
  192. return sim
  193. def calcSememeSim(self, w1, w2, ):
  194. ''' 计算两个义原之间的相似度 '''
  195. if not w1 and not w2:
  196. return 1.0
  197. if not w1 or not w2:
  198. return self.DELTA
  199. if w1 == w2:
  200. return 1.0
  201. d = self.calcSememeDistance(w1, w2) # 这里是刘群的相似度计算公式。
  202. if d >= 0:
  203. return self.ALFA / (self.ALFA + d)
  204. else:
  205. return -1.0
  206. def weight(self, i):
  207. left = 1 - i / 13
  208. PI = 3.1415926536
  209. right = 1 + math.sin(i * PI / 45)
  210. return left * right
  211. def calcSememeDistance(self, w1, w2): # 论文修改了距离函数。
  212. '''
  213. 计算义原之间的距离(义原树中两个节点之间的距离)
  214. '''
  215. s1 = self.getSememeByZh(w1)
  216. s2 = self.getSememeByZh(w2)
  217. if s1 == None or s2 == None:
  218. return -1.0
  219. fatherpath = []
  220. id1, id2 = s1.id, s2.id
  221. father1, father2 = s1.father, s2.father
  222. while (id1 != father1): # 追溯 s1的上层词。
  223. fatherpath.append(id1)
  224. id1 = father1
  225. father_ = self.getSememeByID(father1)
  226. if father_:
  227. father1 = father_.father
  228. fatherpath.append(id1)
  229. len_ = 0.0
  230. while (id2 != father2):
  231. if id2 in fatherpath:
  232. father_pos = fatherpath.index(id2)
  233. return self.weight(father_pos) + len_
  234. id2 = father2
  235. father_ = self.getSememeByID(father2)
  236. if father_:
  237. father2 = father_.father
  238. len_ = len_ + self.weight(1)
  239. if id2 in fatherpath:
  240. father_pos = fatherpath.index(id2)
  241. return self.weight(father_pos) + len_
  242. return 20.0
  243. def calcSememeSimFirst(self, w1, w2):
  244. ''' 计算第一基本义原之间的相似度 '''
  245. return self.calcSememeSim(w1.s_first, w2.s_first)
  246. def calcSememeSimOther(self, w1, w2):
  247. ''' 计算其他义原之间的相似度 '''
  248. if w1.s_other == [] and w2.s_other == []:
  249. return 1.0
  250. sum_ = 0.0
  251. for i in range(len(w1.s_other)):
  252. maxTemp = -1.0
  253. for j in range(len(w2.s_other)):
  254. temp = 0.0
  255. if w1.s_other[i][0] != '(' and w2.s_other[j][0] != '(':
  256. temp = self.calcSememeSim(w1.s_other[i], w2.s_other[j])
  257. elif w1.s_other[i][0] == '(' and w2.s_other[j][0] == '(':
  258. if w1.s_other[i] == w2.s_other[j]:
  259. temp = 1.0
  260. else:
  261. maxTemp = 0.0
  262. else:
  263. temp = self.GAMA
  264. if temp > maxTemp:
  265. maxTemp = temp
  266. if maxTemp == -1.0: # there is no element in w2.s_other
  267. maxTemp = self.DELTA
  268. sum_ = sum_ + maxTemp
  269. if len(w1.s_other) < len(w2.s_other):
  270. sum_ = sum_ + (len(w2.s_other) - len(w1.s_other)) * self.DELTA
  271. return sum_ / max(len(w1.s_other), len(w2.s_other))
  272. def calcSememeSimRelation(self, w1, w2):
  273. ''' 计算关系义原之间的相似度 '''
  274. if w1.s_relation == {} and w2.s_relation == {}:
  275. return 1.0
  276. sum_ = 0.0
  277. for it1 in w1.s_relation.keys():
  278. maxTemp = 0.0
  279. temp = 0.0
  280. if it1 in w2.s_relation.keys():
  281. if w1.s_relation[it1][0] != '(' and w2.s_relation[it1][0] != '(':
  282. temp = self.calcSememeSim(w1.s_relation[it1], w2.s_relation[it1])
  283. elif w1.s_relation[it1][0] == '(' and w2.s_relation[it1][0] == '(':
  284. if w1.s_relation[it1] == w2.s_relation[it1]:
  285. temp = 1.0
  286. else:
  287. maxTemp = 0.0
  288. else:
  289. temp = self.GAMA
  290. else:
  291. maxTemp = self.DELTA
  292. if temp > maxTemp:
  293. maxTemp = temp
  294. sum_ = sum_ + maxTemp
  295. if len(w1.s_relation) < len(w2.s_relation):
  296. sum_ = sum_ + (len(w2.s_relation) - len(w1.s_relation)) * self.DELTA
  297. return sum_ / max(len(w1.s_relation), len(w2.s_relation))
  298. def calcSememeSimSymbol(self, w1, w2):
  299. ''' 计算符号义原之间的相似度 '''
  300. if w1.s_symbol == {} and w2.s_symbol == {}:
  301. return 1.0
  302. sum_ = 0.0
  303. for it1 in w1.s_symbol.keys():
  304. maxTemp = 0.0
  305. temp = 0.0
  306. if it1 in w2.s_symbol.keys():
  307. if w1.s_symbol[it1][0] != '(' and w2.s_symbol[it1][0] != '(':
  308. temp = self.calcSememeSim(w1.s_symbol[it1], w2.s_symbol[it1])
  309. elif w1.s_symbol[it1][0] == '(' and w2.s_symbol[it1][0] == '(':
  310. if w1.s_symbol[it1] == w2.s_symbol[it1]:
  311. temp = 1.0
  312. else:
  313. maxTemp = 0.0
  314. else:
  315. temp = self.GAMA
  316. else:
  317. maxTemp = self.DELTA
  318. if temp > maxTemp:
  319. maxTemp = temp
  320. sum_ = sum_ + maxTemp
  321. if len(w1.s_symbol) < len(w2.s_symbol):
  322. sum_ = sum_ + (len(w2.s_symbol) - len(w1.s_symbol)) * self.DELTA
  323. return sum_ / max(len(w1.s_symbol), len(w2.s_symbol))
  324. def calc(self, w1, w2, ):
  325. '''
  326. 计算两个词的语义相似度(返回值: [0, 1], -2:指定的词词典中不存在)
  327. '''
  328. if w1 == w2:
  329. return 1
  330. sw1 = self.getGlossary(w1) # 获取词表。
  331. sw2 = self.getGlossary(w2)
  332. if sw1 == None or sw2 == None or len(sw1) <= 0 or len(sw2) <= 0:
  333. return -2
  334. max__ = 0
  335. tmp = 0
  336. for i in range(len(sw1)):
  337. for j in range(len(sw2)):
  338. tmp = self.calcGlossarySim(sw1[i], sw2[j])
  339. max__ = max(max__, tmp)
  340. return max__