#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Text semantic similarity, e.g.:
sts([
    ('看图猜一电影名', '看图猜电影'),
    ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
    ('北京到上海的动车票', '上海到北京的动车票'),
])
"""
import re
import time
from concurrent.futures import ThreadPoolExecutor

from Utils.util import phrase_classify
from my_config import sts, pos, dict_tags


def batch_tag(hw_list, ans_given):
    """
    Tag parts of speech in batch.
    :param hw_list:
    :param ans_given:
    :return:
    """
    all_tags_with_str = {}
    # fetch POS tags concurrently (thread pool)
    hw_l = [len(i) for i in hw_list]
    ans_given_choosed = []
    if len(ans_given) > 80:  # only tag a subset of the candidates
        ans_given_choosedbylen = [a for a in ans_given if len(a) in hw_l]
        num = 80 - len(ans_given_choosedbylen)
        ans_given_choosed.extend(ans_given_choosedbylen[:80])
        if num > 0:
            ans_given_choosed.extend(list(set(ans_given) - set(ans_given_choosedbylen))[:num])
    else:
        ans_given_choosed = ans_given
    all_ch = hw_list.copy()
    all_ch.extend(ans_given_choosed)
    all_ch = list(set(all_ch))
    print("all_ch:::", all_ch)
    with ThreadPoolExecutor(max_workers=3) as executor:
        # pos_tag_han(w) returns (w, pos(w)), so each result unpacks directly
        for w, tags in executor.map(pos_tag_han, all_ch):
            all_tags_with_str[w] = tags
    return all_tags_with_str
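

# Hedged usage sketch (hypothetical demo helper, not part of the original
# module); the value shape depends on `pos` from my_config, which isn't shown.
def _demo_batch_tag():
    tags = batch_tag(["画廊"], ["美术画廊", "画廊"])
    # One entry per distinct input string, e.g.
    # {'画廊': <tags from pos('画廊')>, '美术画廊': <tags from pos('美术画廊')>}
    print(tags)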


def groups_choose(hw_list, ans_given):
    """
    Cap the number of pairs sent to similarity scoring.
    :param hw_list:
    :param ans_given:
    :return:
    """
    if len(hw_list) * len(ans_given) > 100:
        length_hw = [len(h.strip()) for h in hw_list]
        new_anss = [a for a in ans_given if len(a) in length_hw]
        anss_rest = [a for a in ans_given if len(a) not in length_hw]
        if len(hw_list) * len(new_anss) <= 90:
            new_anss2 = [a for a in anss_rest if (len(a) > 2 and len(a) - 1 in length_hw) or len(a) + 1 in length_hw]
            if len(hw_list) * (len(new_anss) + len(new_anss2)) > 100:
                new_anss.extend(new_anss2[:int(100 / len(hw_list)) - len(new_anss)])
            elif len(hw_list) * (len(new_anss) + len(new_anss2)) < 50:
                new_anss.extend(anss_rest[:int(100 / len(hw_list)) - len(new_anss)])
        elif len(hw_list) * len(new_anss) > 100:
            new_anss = new_anss[:int(100 / len(hw_list))]
        return new_anss
    return ans_given
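

# Worked example of the <=100-pair cap above (hypothetical demo helper):
# 2 headwords x 60 candidates = 120 pairs, so candidates whose length does
# not match a headword's length are dropped first.
def _demo_groups_choose():
    hw_list = ["画廊", "破除"]                    # both of length 2
    ans_given = ["美术画廊"] * 30 + ["画廊"] * 30  # 2 * 60 = 120 pairs > 100
    kept = groups_choose(hw_list, ans_given)      # keeps the 30 length-2 items
    print(len(hw_list) * len(kept))               # 60 pairs, now under the cap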


def han_similarity(en, hw_list, ans_given, cutted_words, is_token=0):
    """
    Short-text similarity via hanlp.
    When a string contains "的", also check its part of speech.
    is_token: whether hw_list is the output of word segmentation
    :return:
    """
    scores_byrow = []
    scores_byrow_rawshape = []
    num_per_row = []
    all_groups = []
    part_of_speech_s = []
    double_groups_locate = {}
    # ---------- first cap the number of similarity pairs: <= 100 ----------
    ans_given = groups_choose(hw_list, ans_given)
    # -----------------------------------------------------------------------
    st5 = time.time()
    # tag both lists in single calls
    all_hw_tags = pos_tag_han(hw_list, flag="by_list")
    all_ans_tags = pos_tag_han(ans_given, flag="by_list")
    # print(all_hw_tags, ans_given, all_ans_tags)
    # fetch per-string POS tags concurrently
    all_tags_with_str = batch_tag(hw_list, ans_given)
    print("POS tagging time (batch):", time.time() - st5)
    print("m*n:::", len(hw_list), hw_list, len(ans_given))
    # ---------------------------------------------
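    # Data layout from here on: all_groups is a flat list of (hw, ans) pairs;
    # part_of_speech_s holds one 0/1 flag per pair (0 later zeroes the score);
    # num_per_row records how many pairs each headword contributed; and
    # double_groups_locate maps each headword index to the [start, end) span
    # of every original pair's expanded variants within its row.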
    for idi, hw in enumerate(hw_list):
        # print(idi, len(hw_list))
        print("is_token:", is_token)
        if is_token and all_hw_tags[idi] in ["d", "u", "ud", "c"]:  # skip adverbs/particles/conjunctions for segmented tokens
            continue
        hw = re.sub(r"^([\u4e00-\u9fa5])\1", r"\1", hw)  # collapse a doubled leading char, e.g. 速率率
        hw = re.sub(r"^([使得有我令对让向将和与]+)…+$", r"\1", hw)
        row_groups = []
        # part_of_speech_s = []
        row_groups_locate = []
        for idj, ans in enumerate(ans_given):
            double_groups = 1
            # print(ans, pos_tag_han(ans), hw, pos_tag_han(hw))
            row_groups.append((hw, ans))
            if cutted_words and len(en.split(" ")) > 1:
                ans0, hw0 = ans, hw
                is_repl = 0
                for j in cutted_words[::-1]:  # segmentation tokens of hw
                    if j + ";" in ans + ";":
                        ans0 = re.sub(j + "$", "", ans0)
                        hw0 = re.sub(j + "$", "", hw0)
                        is_repl = 1
                if is_repl and ans0 and hw0:
                    row_groups.append((hw0, ans0))
                    double_groups += 1
            if "…" in hw or "…" in ans:
                row_groups.append((hw.replace("…", ""), ans.replace("…", "")))
                double_groups += 1
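            # double_groups counts how many (hw, ans) variants this pair has
            # expanded into so far; every branch below must extend
            # part_of_speech_s by exactly that many flags to stay aligned
            # with row_groups.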
            if re.sub(r"(.+)的$", r"\1", ans) == hw and all_hw_tags[idi] != "a":
                # ans is hw plus a trailing "的" while hw is not an adjective
                part_of_speech_s.extend([0] * double_groups)
            elif re.search(r"(.+)的$", ans) and re.search(r"(.+)的$", hw) is None or (
                    re.search(r"(.+)的$", hw) and re.search(r"(.+)的$", ans) is None and all_ans_tags[idj] != 'a'):
                part_of_speech_s.extend([0] * double_groups)
            elif re.search(r".{2,}地$", ans):
                # if pos_tag_han(ans) in ["d", "v"] and pos_tag_han(hw) not in ["d", "v"]:
                if all_ans_tags[idj] in ["d", "v"] and all_hw_tags[idi] not in ["d", "v"]:
                    part_of_speech_s.extend([0] * double_groups)
                elif re.search(r"(.+)地$", ans) is None and (all_hw_tags[idi] == 'a' or all_ans_tags[idj] != 'n'):
                    part_of_speech_s.extend([0] * double_groups)
                elif re.search(r"(.+)的$", hw):
                    part_of_speech_s.extend([0] * double_groups)
                else:
                    part_of_speech_s.extend([1] * double_groups)
            elif ans[-1] in ["地", "的"] and hw[-1] in ["地", "的"] and ans[-1] != hw[-1]:
                part_of_speech_s.extend([0] * double_groups)
            elif ans[-1] not in ["地", "的"] and hw[-1] in ["地", "的"] and ans == hw[:-1]:
                part_of_speech_s.extend([0] * double_groups)
            elif "的人" in ans and all_ans_tags[idj] == 'n' and all_hw_tags[idi] == 'a' \
                    or ("的人" in hw and all_hw_tags[idi] == 'n' and all_ans_tags[idj] == 'a'):
                part_of_speech_s.extend([0] * double_groups)
            elif "…" + hw + ";" in ans + ";" and phrase_classify(en) in ["prep-phrase", "v-phrase"]:
                part_of_speech_s.extend([0] * double_groups)
            elif re.search(r"[()()]", hw) is None and re.search(hw + "…+[\u4e00-\u9fa5]", ans):
                part_of_speech_s.extend([0] * double_groups)
            elif all_ans_tags[idj] == 'v' and re.search("^使", ans):
                # also compare against ans without its leading "使"
                row_groups.append((hw, re.sub("^使", "", ans)))
                double_groups += 1
                part_of_speech_s.extend([1] * double_groups)
            else:
                hw_pos = [[all_hw_tags[idi]]]  # hw in dict_tags: hw_pos = [pos([hw])]
                ans_pos = [[all_ans_tags[idj]]]
                if hw not in dict_tags and hw in all_tags_with_str:
                    # if hw not in all_tags_with_str:
                    #     a_pos = pos(hw)
                    #     all_tags_with_str[hw] = a_pos
                    hw_pos.extend(all_tags_with_str[hw])
                if ans not in dict_tags and ans in all_tags_with_str:
                    # ans_pos = [pos([ans])]
                    # if ans not in all_tags_with_str:
                    #     b_pos = pos(ans)
                    #     all_tags_with_str[ans] = b_pos
                    ans_pos.extend(all_tags_with_str[ans])
                # print(hw_pos, ans_pos)
                if all(i not in sum(ans_pos, []) for i in sum(hw_pos, [])):  # no POS in common, e.g. 'a' vs 'v'; logic TBD
                    part_of_speech_s.extend([0] * double_groups)
                else:
                    part_of_speech_s.extend([1] * double_groups)
            if not row_groups_locate:
                row_groups_locate.append([0, double_groups])
            else:
                bef = row_groups_locate[-1][1]
                row_groups_locate.append([bef, bef + double_groups])
        double_groups_locate[idi] = row_groups_locate
        # print("row_groups:", row_groups)
        num_per_row.append(len(row_groups))
        all_groups.extend(row_groups)
        # Originally each hw was scored against its answers as a separate batch:
        # simi_score = sts(row_groups)
        # print("part_of_speech_s:", part_of_speech_s)
        # print("simi_score[{}]:".format(row_groups), simi_score)
        # scores_byrow.append(simi_score)
        # if 0 in part_of_speech_s:
        #     scores_byrow[-1] = list(map(lambda x, y: x * y, scores_byrow[-1], part_of_speech_s))
    # Score all combinations in one call instead
    print("POS tagging took:", time.time() - st5)
    st1 = time.time()
    print(all_groups)
    # print(part_of_speech_s)
    simi_score = sts(all_groups) if all_groups else []  # guard the empty batch
    print("sts scoring took:", time.time() - st1)
    # print(num_per_row)
    # -------------- locally update similarities ---------------------------
    # ------------ swap pairs scoring > 0.9 and rescore them ------------
    groups_09 = [[i, all_groups[i]] for i, j in enumerate(simi_score) if j > 0.9 and part_of_speech_s[i]]
    rejudge_groups = [(s[1][1], s[1][0]) for s in groups_09]
    print("rejudge_groups>0.9:", rejudge_groups)
    if groups_09 and len(rejudge_groups) <= 3 and len(re.findall(r"[a-zA-Z'\-()()]+", en.strip())) == 1:
        simi_score_reversed = sts(rejudge_groups)
        simi_score_le_09 = [i for i, si in enumerate(simi_score_reversed) if si < 0.6]
        if simi_score_le_09:
            for i in simi_score_le_09:
                simi_score[groups_09[i][0]] = simi_score_reversed[i]
    # ------- otherwise swap pairs scoring > 0.8, rescore, and take the max -------
    elif not groups_09:
        groups_08 = [[i, all_groups[i]] for i, j in enumerate(simi_score) if j > 0.8 and part_of_speech_s[i]]
        rejudge_groups = [(s[1][1], s[1][0]) for s in groups_08]
        print("rejudge_groups>0.8:", rejudge_groups)
        simi_score_reversed = sts(rejudge_groups) if rejudge_groups else []  # guard the empty batch
        simi_score_ge_08 = [i for i, si in enumerate(simi_score_reversed) if si > 0.9]
        if simi_score_ge_08:
            for i in simi_score_ge_08:
                simi_score[groups_08[i][0]] = simi_score_reversed[i]
    # ------------------------------------------------------------------------------
    idx = 0
    for n in num_per_row:
        score_a_row = simi_score[idx: idx + n]
        if 0 in part_of_speech_s[idx: idx + n]:
            score_a_row = list(map(lambda x, y: x * y, score_a_row, part_of_speech_s[idx: idx + n]))
        idx += n
        scores_byrow.append(score_a_row)
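    # Worked example of the masking above (hypothetical numbers): with
    # num_per_row = [2], part_of_speech_s = [1, 0] and simi_score = [0.9, 0.8],
    # scores_byrow becomes [[0.9, 0.0]]: a POS mismatch zeroes the similarity.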
    # --------------- restore scores to the original group shape -------------
    if not is_token and len(double_groups_locate) == len(scores_byrow) and all(
            k1 < len(scores_byrow) and v1[-1][1] == len(scores_byrow[k1])
            for k1, v1 in double_groups_locate.items()):
        for idn, row in enumerate(list(double_groups_locate.values())):
            new_row = []
            score_row = scores_byrow[idn]
            if any(r[1] - r[0] > 1 for r in row):
                for r in row:
                    if r[1] - r[0] > 1:
                        # the pair was expanded into variants: keep the best score
                        new_row.append(max(score_row[r[0]: r[1]]))
                    else:
                        new_row.append(score_row[r[0]])
            else:
                new_row = score_row
            scores_byrow_rawshape.append(new_row)
    # print(scores_byrow)
    return scores_byrow, scores_byrow_rawshape
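

# Hedged end-to-end sketch (hypothetical demo helper; "gallery" is an invented
# input). Actual scores depend on the sts model from my_config, and this
# assumes phrase_classify accepts a bare word.
def _demo_han_similarity():
    scores_byrow, scores_rawshape = han_similarity(
        "gallery", ["画廊"], ["美术画廊", "画廊"], [], is_token=0)
    print(scores_byrow)     # one score row per headword, e.g. [[0.8x, 1.0]]
    print(scores_rawshape)  # same rows with expanded variants collapsed to max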


def pos_tag_han(w, flag="by_str"):
    """
    Part-of-speech tagging.
    :return:
    """
    # print(hanlp.pretrained.pos.ALL)  # list all pretrained POS models
    if flag == "by_str":
        return w, pos(w)
    if isinstance(w, str):
        return pos([w])[0]
    elif isinstance(w, list):
        return pos(w)
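

# The call styles above, as a sketch (hypothetical demo helper; the actual
# tags depend on the pos model from my_config):
def _demo_pos_tag_han():
    w, tags = pos_tag_han("草拟")                 # by_str: returns (input, pos(input))
    one_tag = pos_tag_han("草拟", flag="by_list")  # str  -> pos(["草拟"])[0]
    tag_list = pos_tag_han(["草拟", "分解"], flag="by_list")  # list -> pos(list)
    print(w, tags, one_tag, tag_list)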


if __name__ == '__main__':
    t1 = time.time()
    # print(pos_tag_han(['环境友好的'], flag="by_list"))
    simi_score = sts([('画廊', '美术画廊'), ('美术画廊', '画廊')])
    print(simi_score)
    # '使物质分解', '破裂', '分解', '消除', '损坏', '机器或车辆出毛病', '讨论、关系或系统失败'
    # '破除', '感情失控(痛哭起来)', '感情失控', '感情失控痛哭起来',
    # han_similarity(['看图猜一电影名', '看图猜电影'], ['北京到上海的动车票', '上海到北京的动车票'])
    # a = han_similarity(['抛弃'], ['破除', '捣毁', '拆除', '破除障碍或偏见', '破除(障碍或偏见)'])
    # a = sts([('特别地', '尤其'), ('特别地', '特别')])
    # print(a)
    # a = han_similarity("effective measure", ['有效的措施'], ['有效措施'], is_token=0)
    # print(a)
    # print(time.time() - t1)
    a = pos("草拟")
    # print(pos_tag_han("范畴"))
    # a.append(pos_tag("明白"))
    print(a)
    # print(pos_tag(["明白"]))
    # print(pos_tag("明白"))
    # print(pos("知道"))
    # print(pos("明白"))
    # a = pos(["必"])
    # b = pos(["分解"])
    # c = pos(["不是"])
    # print(c)
    # b1 = [['j'], ['v'], ['v'], ['v']]
    # b2 = [['j'], ['v'], ['n']]
    # print(all([True if i not in b1 else False for i in b2]))
    # aa = ['“垃圾', '废弃物无用的东西', '乌七八糟的东西', '垃圾', '废物']
    # pos_tag_han(aa)
    # rrr = pos_tag_han("不想要的", flag="by_list")
    # print(rrr)