Hybrid_Sim.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # -*- coding: utf-8 -*-
'''
@author: yaleimeng@sina.com
@license: (C) Copyright 2018
@desc: Hybrid word-similarity computation that combines the Cilin thesaurus and HowNet,
       producing similarity values that better match human intuition.
@DateTime: Created on 2018/1/25, at 08:59 AM by PyCharm
'''
  7. from .hownet.howNet import How_Similarity
  8. from .cilin.V3.ciLin import CilinSimilarity
  9. from .fanyi.anto_Judger import AntonymJudger
  10. class HybridSim():
  11. '''
  12. 混合相似度计算策略。使用了词林与知网词汇量的并集。扩大了词汇覆盖范围。
  13. '''
  14. ci_lin = CilinSimilarity() # 实例化词林相似度计算对象
  15. how_net = How_Similarity() # 实例化知网相似度计算对象
  16. Common = ci_lin.vocab & how_net.vocab
  17. A = how_net.vocab - ci_lin.vocab
  18. B = ci_lin.vocab - how_net.vocab
  19. anto = AntonymJudger()
  20. @classmethod
  21. def get_Middle_sim(cls, w1, w2):
  22. # 在python的类方法中,默认使用的第一个参数是cls
  23. # self表示一个具体的实例本身,cls表示这个类本身
  24. lin = cls.ci_lin.sim2018(w1, w2) if w1 in cls.ci_lin.vocab and w2 in cls.ci_lin.vocab else 0
  25. how = cls.how_net.calc(w1, w2) if w1 in cls.how_net.vocab and w2 in cls.how_net.vocab else 0
  26. if w1 in cls.Common and w2 in cls.Common: # 两个词都被词林和知网共同收录。
  27. print('两个词都被词林和知网共同收录。', end='\t')
  28. print(w1, w2, '词林改进版相似度:', lin, end='\t')
  29. print('知网相似度结果为:', how, end='\t')
  30. # return lin * 1 + how * 0 # 可以调节两者的权重,以获取更优结果!!
  31. # return lin * 0.4 + how * 0.6
  32. return max([lin, how])
  33. if w1 in cls.A and w2 in cls.A: # 两个词都只被知网收录。
  34. return how
  35. if w1 in cls.B and w2 in cls.B: # 两个词都只被词林收录。
  36. return lin
  37. if w1 in cls.A and w2 in cls.B: # 一个只被词林收录,另一个只被知网收录。
  38. print('触发策略三,左词为知网,右词为词林')
  39. same_words = cls.ci_lin.code_word[cls.ci_lin.word_code[w2][0]]
  40. if not same_words:
  41. return 0.2
  42. all_sims = [cls.how_net.calc(word, w1) for word in same_words]
  43. print(same_words, all_sims, end='\t')
  44. if max(all_sims) == 0.0 and -2 in all_sims:
  45. return -2 # 词典中不存在
  46. return max(all_sims)
  47. if w2 in cls.A and w1 in cls.B:
  48. print('触发策略三,左词为词林,右词为知网')
  49. same_words = cls.ci_lin.code_word[cls.ci_lin.word_code[w1][0]]
  50. if not same_words:
  51. return 0.2
  52. all_sims = [cls.how_net.calc(word, w2) for word in same_words]
  53. # print(w1, '词林同义词有:', same_words, all_sims, end='\t')
  54. if max(all_sims) == 0.0 and -2 in all_sims:
  55. return -2 # 词典中不存在
  56. return max(all_sims)
  57. if w1 in cls.A and w2 in cls.Common: # 左知网,右都有
  58. print('策略四(左知网):知网相似度结果为:', how)
  59. same_words = cls.ci_lin.code_word[cls.ci_lin.word_code[w2][0]]
  60. if not same_words:
  61. return how
  62. all_sims = [cls.how_net.calc(word, w1) for word in same_words]
  63. # print(w2, '词林同义词有:', same_words, all_sims, end='\t')
  64. return 0.6 * how + 0.4 * max(all_sims)
  65. if w2 in cls.A and w1 in cls.Common: # 右知网,左都有
  66. print('策略四(右知网):知网相似度结果为:', how)
  67. same_words = cls.ci_lin.code_word[cls.ci_lin.word_code[w1][0]]
  68. if not same_words:
  69. return how
  70. all_sims = [cls.how_net.calc(word, w2) for word in same_words]
  71. print(same_words, all_sims, end='\t')
  72. return 0.6 * how + 0.4 * max(all_sims)
  73. if w1 in cls.B and w2 in cls.Common: # 左词林,右都有
  74. print(w1, w2, '策略五(左词林):词林改进版相似度:', lin)
  75. same_words = cls.ci_lin.code_word[cls.ci_lin.word_code[w1][0]]
  76. if not same_words:
  77. return lin
  78. all_sims = [cls.how_net.calc(word, w2) for word in same_words]
  79. # print(w1, '词林同义词有:', same_words, all_sims, end='\t')
  80. return 0.6 * lin + 0.4 * max(all_sims)
  81. if w2 in cls.B and w1 in cls.Common: # 右词林,左都有
  82. print(w1, w2, '策略五(右词林):词林改进版相似度:', lin)
  83. same_words = cls.ci_lin.code_word[cls.ci_lin.word_code[w2][0]]
  84. if not same_words:
  85. return lin
  86. all_sims = [cls.how_net.calc(word, w1) for word in same_words]
  87. # print(w2, '词林同义词有:', same_words, all_sims, end='\t')
  88. return 0.6 * lin + 0.4 * max(all_sims)
  89. # print('对不起,词语可能未收录,无法计算相似度!')
  90. return -1
  91. @classmethod
  92. def get_Final_sim(cls, w1, w2):
  93. if cls.anto.is_anti_pair(w1, w2):
  94. return 1 - cls.get_Middle_sim(w1, w2)
  95. else:
  96. return cls.get_Middle_sim(w1, w2)
  97. if __name__ == '__main__':
  98. hb = HybridSim()
  99. # a = hb.get_Final_sim("废除", "我们")
  100. a = hb.get_Final_sim("日常", "例行")
  101. print("\n最后结果:", a)