# cdZWj/en2cn
# encoding:utf-8
'''
Created on 2016年9月7日
@author: liuyu

Last updated: 2018年2月25日
@editor: yaleimeng
@主要改动：
1，修复了读取词库遗漏的bug，不重复词从53251增加到53336个
2，对于不够Pythonic的代码按python风格改写，
3，按Python风格改写后，省去了几个多余函数。可读性更好
'''
import math
import os


def parseZhAndEn(text):
    """Split a two-part ``left|right`` label and return it as ``(right, left)``.

    Callers in this file unpack the result as ``(zh, en)``, i.e. the part
    after the bar first.  Any text that does not contain exactly one ``|``
    is returned unchanged in both positions.
    """
    if text.count('|') != 1:
        return text, text
    before_bar, _, after_bar = text.partition('|')
    return after_bar, before_bar


class GlossaryElement:
    """One glossary entry: a word, its part of speech and its sememe description.

    The description is split into four groups:

    * ``s_first``    - the first basic sememe (a string)
    * ``s_other``    - the remaining plain sememes and parenthesised values (a list)
    * ``s_relation`` - ``key=value`` sememes (a dict)
    * ``s_symbol``   - sememes introduced by a non-letter symbol (a dict keyed
      by that symbol character)
    """

    def __init__(self):
        self.word = ''  # the word itself
        self.type = ''  # part of speech
        self.solid = False  # True for content words, False for function words
        self.s_first = ''  # first basic sememe
        self.s_other = []  # other sememes
        self.s_relation = {}  # relation sememes
        self.s_symbol = {}  # symbol sememes

    def dump(self):
        """Print the entry to stdout, one piece per line (debugging aid)."""
        print(self.word + ',' + self.type + ', | first:' + self.s_first + ' | other:')
        for sememe in self.s_other:
            print(sememe + ',')

        print(' | relation:')
        for key, value in self.s_relation.items():
            print(key + '=' + value + ',')

        print(' | symbol:')
        for key, value in self.s_symbol.items():
            print(key + '=' + value + ',')

        print('\n')

    def parse(self, text):
        """Fill this entry from one ``word/type/definition`` line.

        ``definition`` is a comma-separated list of sememes.  Surrounding
        whitespace (including the trailing newline of a line read from a
        file) is removed first so that it cannot leak into the last sememe.

        Returns True when the line has exactly three ``/``-separated fields,
        False for a blank line or any other field count.
        """
        line = text.strip()
        if not line:
            return False
        items = line.split('/')
        if len(items) != 3:
            return False
        self.word, self.type, definition = items

        # A definition wrapped in braces marks a function word.  The check on
        # the whole line is retained so that every line the previous
        # implementation classified as a function word still is.
        # NOTE(review): assumes function-word definitions look like "{...}" -
        # confirm against the glossary file.
        is_function_word = line.startswith('{') or definition.startswith('{')
        self.solid = not is_function_word
        if is_function_word:
            if definition.startswith('{'):
                definition = definition[1:]
            if definition.endswith('}'):
                definition = definition[:-1]

        for index, sememe in enumerate(definition.split(',')):
            if not sememe:  # empty definition or doubled comma: nothing to classify
                continue
            first_char = sememe[0]

            # Only the very first sememe can be the "first basic sememe", and
            # only when it starts with a letter.
            if index == 0 and first_char.isalpha():
                self.s_first, _ = parseZhAndEn(sememe)
                continue

            if first_char == '(':
                self.s_other.append(sememe)
                continue

            key, separator, value = sememe.partition('=')
            if separator:
                # Keep the whole text after '=', not just its first character.
                if value and value[0] != '(':
                    value, _ = parseZhAndEn(value)
                self.s_relation[key] = value
                continue

            if not first_char.isalpha():
                value = sememe[1:]
                if value and value[0] != '(':
                    value, _ = parseZhAndEn(value)
                self.s_symbol[first_char] = value
                continue

            self.s_other.append(sememe)
        return True


class SememeElement:
    """One sememe record: its id, its parent's id and its two-language label."""

    def __init__(self):
        self.id = -1  # id of this sememe
        self.father = -1  # id of the parent sememe
        self.sememe_zh = ''  # Chinese label
        self.sememe_en = ''  # English label

    def parse(self, line):
        """Fill this record from a whitespace-separated ``id label father`` line.

        ``label`` is handed to ``parseZhAndEn`` to obtain the two label parts.
        Returns True on success; False when ``line`` is empty or does not
        contain exactly three fields (the record is left untouched then).
        """
        fields = line.split() if line else []
        if len(fields) != 3:
            return False
        self.id, label, self.father = fields
        self.sememe_zh, self.sememe_en = parseZhAndEn(label)
        return True


def valuesOfGlossarytable_(glossarytable_, word):
    """Collect every glossary entry stored for ``word``.

    Keys of ``glossarytable_`` have the form ``"<prefix>\\t<word>"``; an entry
    matches when the second tab-separated field of its key equals ``word``.
    The whole table is scanned and matches are returned in iteration order.
    """
    return [
        entry
        for table_key, entry in glossarytable_.items()
        if table_key.split('\t')[1] == word
    ]


class How_Similarity:
    """Word similarity computed from sememe descriptions.

    On construction the sememe table and the glossary are loaded from two
    files located next to this module.  ``calc`` is the public entry point;
    it combines four partial similarities (first sememe, other sememes,
    relation sememes, symbol sememes) weighted by ``BETA``.
    """

    def __init__(self):
        self.sememetable_ = dict()  # sememe id -> sememe record
        self.sememeindex_zn_ = dict()  # Chinese sememe label -> sememe record
        self.glossarytable_ = dict()  # "<running number>\t<word>" -> glossary entry
        self.glossaryfile = os.path.join(os.path.dirname(__file__), 'glossary.txt')
        self.sememefile = os.path.join(os.path.dirname(__file__), 'whole.dat')
        self.vocab = set()  # every word that has at least one glossary entry
        self.BETA = [0.5, 0.2, 0.17, 0.13]  # weights of the four partial similarities
        self.GAMA = 0.2  # score for a parenthesised value compared with a plain sememe
        self.DELTA = 0.2  # score for an item that has no counterpart on the other side
        self.ALFA = 1.6  # parameter of the distance-to-similarity formula
        self.init()

    def init(self):
        """Load the sememe table, then the glossary.

        Returns True when both loaders succeed; otherwise prints an error
        naming the file that failed and returns False.
        """
        if not self.loadSememeTable(self.sememefile):
            print("[ERROR] %s 加载失败." % self.sememefile)
            return False
        if not self.loadGlossary(self.glossaryfile):
            print("[ERROR] %s 加载失败." % self.glossaryfile)
            return False
        return True

    def loadSememeTable(self, filename):
        """Read sememe records from ``filename`` into both sememe indexes.

        Returns False if reading or parsing raises; a failure to open the
        file is not caught here and propagates to the caller.
        """
        with open(filename, 'rt', encoding='utf-8') as reader:
            try:
                for line in reader.readlines():
                    if not line.strip():  # skip blank lines
                        continue
                    ele = SememeElement()
                    if ele.parse(line):
                        self.sememetable_[ele.id] = ele
                        self.sememeindex_zn_[ele.sememe_zh] = ele
            except Exception as e:
                print('function loadSememeTable has Errors!!')
                print(e)
                return False
        return True

    def loadGlossary(self, filename):
        """Read glossary entries from ``filename``.

        Every parsed entry is stored under ``"<running number>\\t<word>"`` so
        that several entries for the same word can coexist.  Returns False
        for an empty file or when reading/parsing raises; a failure to open
        the file propagates to the caller.
        """
        with open(filename, 'r', encoding='utf-8') as reader:
            try:
                lines = reader.readlines()
                if not lines:
                    return False
                count = 0
                for line in lines:
                    if not line.strip():  # skip blank lines
                        continue
                    ele = GlossaryElement()
                    if ele.parse(line):
                        self.glossarytable_[str(count) + '\t' + ele.word] = ele
                        self.vocab.add(ele.word)
                        count += 1
            except Exception as e:
                print('function loadGlossary has errors!!', e)
                return False
        return True

    def getSememeByID(self, id_):
        """Return the sememe record with id ``id_``, or None if unknown."""
        return self.sememetable_.get(id_)

    def getSememeByZh(self, word):
        """Return the sememe record whose Chinese label is ``word``, or None."""
        return self.sememeindex_zn_.get(word)

    def getGlossary(self, word):
        """Return the list of glossary entries for ``word``, or None if the word is unknown."""
        if word in self.vocab:
            return valuesOfGlossarytable_(self.glossarytable_, word)
        return None

    def calcGlossarySim(self, w1, w2):
        """Return the similarity of two glossary entries.

        Yields 0.0 when either entry is None or when one is a content word
        and the other a function word.  Otherwise the four partial
        similarities are combined cumulatively: each later part is scaled by
        the product of all earlier ones before its ``BETA`` weight is applied.
        """
        if w1 is None or w2 is None:
            return 0.0
        if w1.solid != w2.solid:
            return 0.0

        sim1 = self.calcSememeSimFirst(w1, w2)
        sim2 = self.calcSememeSimOther(w1, w2)
        sim3 = self.calcSememeSimRelation(w1, w2)
        sim4 = self.calcSememeSimSymbol(w1, w2)

        return (self.BETA[0] * sim1
                + self.BETA[1] * sim1 * sim2
                + self.BETA[2] * sim1 * sim2 * sim3
                + self.BETA[3] * sim1 * sim2 * sim3 * sim4)

    def calcSememeSim(self, w1, w2):
        """Return the similarity of two sememes given by their Chinese labels.

        * both empty            -> 1.0
        * exactly one empty     -> ``DELTA``
        * identical             -> 1.0
        * otherwise             -> ``ALFA / (ALFA + distance)``, or -1.0 when
          the distance is negative (a label is not in the sememe index)
        """
        if not w1 and not w2:
            return 1.0
        if not w1 or not w2:
            return self.DELTA
        if w1 == w2:
            return 1.0

        d = self.calcSememeDistance(w1, w2)
        if d >= 0:
            return self.ALFA / (self.ALFA + d)
        return -1.0

    def weight(self, i):
        """Return the edge weight used for position ``i`` on an ancestor path."""
        left = 1 - i / 13
        # Truncated literal kept on purpose so scores stay numerically
        # identical to earlier runs.
        PI = 3.1415926536
        right = 1 + math.sin(i * PI / 45)
        return left * right

    def calcSememeDistance(self, w1, w2):
        """Return the distance between two sememes in the sememe tree.

        Returns -1.0 when either label is unknown and 20.0 when the two
        sememes share no ancestor.  A record whose ``father`` equals its own
        ``id`` is treated as a root.
        """
        s1 = self.getSememeByZh(w1)
        s2 = self.getSememeByZh(w2)
        if s1 is None or s2 is None:
            return -1.0

        # Ids from s1 up to (and including) its root, nearest first.
        fatherpath = []
        id1, father1 = s1.id, s1.father
        while id1 != father1:
            fatherpath.append(id1)
            id1 = father1
            father_ = self.getSememeByID(father1)
            if father_ is not None:
                father1 = father_.father
        fatherpath.append(id1)

        # Climb from s2 until an id on s1's path is met.
        len_ = 0.0
        id2, father2 = s2.id, s2.father
        while id2 != father2:
            if id2 in fatherpath:
                return self.weight(fatherpath.index(id2)) + len_
            id2 = father2
            father_ = self.getSememeByID(father2)
            if father_ is not None:
                father2 = father_.father
            len_ = len_ + self.weight(1)

        if id2 in fatherpath:
            return self.weight(fatherpath.index(id2)) + len_
        return 20.0

    def _calcDescriptorSim(self, item1, item2):
        """Compare two description items, each a plain sememe or a parenthesised value."""
        wrapped1 = item1.startswith('(')
        wrapped2 = item2.startswith('(')
        if not wrapped1 and not wrapped2:
            return self.calcSememeSim(item1, item2)
        if wrapped1 and wrapped2:
            return 1.0 if item1 == item2 else 0.0
        return self.GAMA

    def _calcKeyedSememeSim(self, map1, map2):
        """Shared scoring for the relation and symbol dictionaries of two entries."""
        if not map1 and not map2:
            return 1.0

        sum_ = 0.0
        for key, value1 in map1.items():
            if key in map2:
                # Negative sememe similarities are floored at zero here.
                sum_ += max(0.0, self._calcDescriptorSim(value1, map2[key]))
            else:
                sum_ += max(0.0, self.DELTA)

        if len(map1) < len(map2):
            sum_ += (len(map2) - len(map1)) * self.DELTA
        return sum_ / max(len(map1), len(map2))

    def calcSememeSimFirst(self, w1, w2):
        """Return the similarity of the first basic sememes of two entries."""
        return self.calcSememeSim(w1.s_first, w2.s_first)

    def calcSememeSimOther(self, w1, w2):
        """Return the similarity of the "other sememe" lists of two entries.

        Each item of ``w1`` is scored by its best match in ``w2`` (``DELTA``
        when no match scored above -1.0); surplus items of ``w2`` each add
        ``DELTA``; the sum is divided by the longer list's length.
        """
        others1, others2 = w1.s_other, w2.s_other
        if not others1 and not others2:
            return 1.0

        sum_ = 0.0
        for item1 in others1:
            best = -1.0  # sentinel: nothing in others2 scored above -1.0
            for item2 in others2:
                temp = self._calcDescriptorSim(item1, item2)
                # Keep the running maximum; a later mismatch must not
                # discard an earlier, better match.
                if temp > best:
                    best = temp
            if best == -1.0:
                best = self.DELTA
            sum_ += best

        if len(others1) < len(others2):
            sum_ += (len(others2) - len(others1)) * self.DELTA
        return sum_ / max(len(others1), len(others2))

    def calcSememeSimRelation(self, w1, w2):
        """Return the similarity of the relation sememes of two entries."""
        return self._calcKeyedSememeSim(w1.s_relation, w2.s_relation)

    def calcSememeSimSymbol(self, w1, w2):
        """Return the similarity of the symbol sememes of two entries."""
        return self._calcKeyedSememeSim(w1.s_symbol, w2.s_symbol)

    def calc(self, w1, w2):
        """Return the semantic similarity of two words.

        Identical words yield 1 without consulting the glossary.  When either
        word has no glossary entry the result is -2.  Otherwise the best score
        over all pairs of entries is returned, never less than 0.
        """
        if w1 == w2:
            return 1
        sw1 = self.getGlossary(w1)
        sw2 = self.getGlossary(w2)
        if not sw1 or not sw2:
            return -2

        max__ = 0
        for entry1 in sw1:
            for entry2 in sw2:
                max__ = max(max__, self.calcGlossarySim(entry1, entry2))
        return max__