#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Semantic textual similarity (STS) for short Chinese texts.

Example::

    sts([
        ('看图猜一电影名', '看图猜电影'),
        ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
        ('北京到上海的动车票', '上海到北京的动车票'),
    ])

NOTE(review): this file was recovered from a whitespace-mangled source.  The
indentation below is a best-effort reconstruction of the original structure;
points where the nesting is ambiguous are marked with NOTE(review) comments.
"""
import re
import time
from Utils.util import phrase_classify
from concurrent.futures import ThreadPoolExecutor
from my_config import sts, pos, dict_tags


def batch_tag(hw_list, ans_given):
    """
    POS-tag a batch of strings (headwords plus candidate answers).

    At most 80 answers are tagged; answers whose character length matches
    some headword's length are preferred when truncating.

    :param hw_list: list of headword strings
    :param ans_given: list of candidate answer strings
    :return: dict mapping each tagged string to its POS tags
             (whatever ``pos(w)`` returns for that string)
    """
    all_tags_with_str = {}
    # Fetch POS tags concurrently.  NOTE(review): the original comment said
    # "multi-process", but this is a thread pool.
    hw_l = [len(i) for i in hw_list]
    ans_given_choosed = []
    if len(ans_given) > 80:
        # Too many answers: tag only a subset, preferring answers whose
        # length matches some headword's length.
        ans_given_choosedbylen = [a for a in ans_given if len(a) in hw_l]
        num = 80 - len(ans_given_choosedbylen)
        ans_given_choosed.extend(ans_given_choosedbylen[:80])
        if num > 0:
            # Top up with answers of non-matching length, up to the 80 cap.
            # NOTE(review): set difference makes the top-up order
            # non-deterministic — confirm that is acceptable.
            ans_given_choosed.extend(list(set(ans_given) - set(ans_given_choosedbylen))[:num])
    else:
        ans_given_choosed = ans_given
    all_ch = hw_list.copy()
    all_ch.extend(ans_given_choosed)
    all_ch = list(set(all_ch))  # de-duplicate before tagging
    print("all_ch:::", all_ch)
    with ThreadPoolExecutor(max_workers=3) as executor:
        # pos_tag_han(w) with the default flag="by_str" returns (w, pos(w)),
        # so future[0] is the string and future[1] its tags.
        for future in executor.map(pos_tag_han, all_ch):
            all_tags_with_str[future[0]] = future[1]
    return all_tags_with_str


def groups_choose(hw_list, ans_given):
    """
    Cap the number of similarity pairs at roughly 100.

    When ``len(hw_list) * len(ans_given)`` exceeds 100, keep the answers
    whose length matches some headword's length, then top up with
    near-length (or arbitrary remaining) answers to stay near the budget.

    :param hw_list: list of headword strings
    :param ans_given: list of candidate answer strings
    :return: the (possibly reduced) list of answers
    """
    if len(hw_list) * len(ans_given) > 100:
        length_hw = [len(h.strip()) for h in hw_list]
        # Answers whose length exactly matches some headword's length.
        new_anss = [a for a in ans_given if len(a) in length_hw]
        anss_rest = [a for a in ans_given if len(a) not in length_hw]
        if len(hw_list) * len(new_anss) <= 90:
            # Answers whose length is within 1 of some headword length.
            # NOTE(review): precedence reads as
            # "(len>2 and len-1 matches) or (len+1 matches)" — confirm intended.
            new_anss2 = [a for a in anss_rest if (len(a) > 2 and len(a)-1 in length_hw) or len(a)+1 in length_hw]
            if len(hw_list) * (len(new_anss) + len(new_anss2)) > 100:
                new_anss.extend(new_anss2[:int(100/len(hw_list))-len(new_anss)])
            elif len(hw_list) * (len(new_anss) + len(new_anss2)) < 50:
                new_anss.extend(anss_rest[:int(100 / len(hw_list)) - len(new_anss)])
            # NOTE(review): when the combined count falls in [50, 100],
            # new_anss2 is never added — confirm that is intended.
        elif len(hw_list) * len(new_anss) > 100:
            new_anss = new_anss[:int(100/len(hw_list))]
        return new_anss
    return ans_given


def han_similarity(en, hw_list, ans_given, cutted_words, is_token=0):
    """
    hanlp-based short-text similarity between headwords and answers.

    When a phrase ends in "的"/"地", part of speech is consulted to zero
    out grammatically mismatched pairs.

    :param en: the English source phrase the Chinese strings translate
    :param hw_list: list of headword strings (one row of scores per entry)
    :param ans_given: list of reference-answer strings
    :param cutted_words: word-segmentation result of the headword (or falsy)
    :param is_token: truthy when hw_list holds word-segmentation tokens;
                     adverb-like tokens ("d", "u", "ud", "c") are then skipped
    :return: (scores_byrow, scores_byrow_rawshape) — per-headword score rows,
             and the rows collapsed back to one score per original answer
             (max over the variant pairs generated for that answer)
    """
    scores_byrow = []
    scores_byrow_rawshape = []
    num_per_row = []           # number of (hw, ans) pairs generated per headword row
    all_groups = []            # all (hw, ans) pairs, fed to sts() in one batch
    part_of_speech_s = []      # 0/1 mask aligned with all_groups: 0 forces score to 0
    double_groups_locate = {}  # row index -> [start, end) spans of variant pairs per answer
    # ---------------- first cap the number of similarity pairs: <=100 ----------
    ans_given = groups_choose(hw_list, ans_given)
    # -------------------------------------------------------
    st5 = time.time()
    # Tag both lists in one call each.
    all_hw_tags = pos_tag_han(hw_list, flag="by_list")
    all_ans_tags = pos_tag_han(ans_given, flag="by_list")
    # print(all_hw_tags, ans_given, all_ans_tags)
    # Concurrently tag the individual strings as well (used in the fallback
    # POS-overlap check below).
    all_tags_with_str = batch_tag(hw_list, ans_given)
    print("111111词性标注时间::", time.time() - st5)
    print("m*n:::", len(hw_list), hw_list, len(ans_given))
    # ---------------------------------------------
    for idi, hw in enumerate(hw_list):
        # print(idi, len(hw_list))
        print("is_token:", is_token)
        if is_token and all_hw_tags[idi] in ["d", "u", "ud", "c"]:
            # When scoring segmentation tokens, skip adverb/particle/conjunction
            # tokens for now.
            continue
        # Drop a duplicated leading CJK character (original example: 速率率).
        # NOTE(review): the pattern only collapses a doubled FIRST character,
        # which does not actually match the quoted example — confirm intent.
        hw = re.sub(r"^([\u4e00-\u9fa5])\1", r"\1", hw)
        # Strip a trailing ellipsis after a leading function word such as 使/得/有…
        hw = re.sub(r"^([使得有我令对让向将和与]+)…+$", r"\1", hw)
        row_groups = []
        # part_of_speech_s = []
        row_groups_locate = []
        for idj, ans in enumerate(ans_given):
            double_groups = 1  # how many variant pairs this answer contributes
            # print(ans, pos_tag_han(ans), hw, pos_tag_han(hw))
            row_groups.append((hw, ans))
            if cutted_words and len(en.split(" ")) > 1:
                ans0, hw0 = ans, hw
                is_repl = 0
                # cutted_words is the segmentation of hw; strip shared trailing
                # tokens from both strings and score the residue as an extra pair.
                for j in cutted_words[::-1]:
                    if j+";" in ans + ";":
                        ans0 = re.sub(j+"$", "", ans0)
                        hw0 = re.sub(j+"$", "", hw0)
                        is_repl = 1
                if is_repl and ans0 and hw0:
                    row_groups.append((hw0, ans0))
                    double_groups += 1
            if "…" in hw or "…" in ans:
                # Also compare with the ellipsis placeholder removed.
                row_groups.append((hw.replace("…", ""), ans.replace("…", "")))
                double_groups += 1
            # NOTE(review): in the recovered source this append sits before the
            # main if/elif chain at the same level; it adds ONE extra mask entry
            # without a matching pair, which would desynchronize
            # part_of_speech_s from all_groups — confirm against the original.
            if re.sub(r"(.+)的$", r"\1", ans) == hw and all_hw_tags[idi] != "a":
                part_of_speech_s.append(0)
            # Main grammatical-mismatch chain: each branch appends exactly
            # double_groups mask entries (0 = suppress, 1 = keep).
            if re.search(r"(.+)的$", ans) and re.search(r"(.+)的$", hw) is None or (
                    re.search(r"(.+)的$", hw) and re.search(r"(.+)的$", ans) is None and all_ans_tags[idj] != 'a'):
                # Exactly one side ends in 的 — treat as mismatch.
                part_of_speech_s.extend([0]*double_groups)
            elif re.search(r".{2,}地$", ans):
                # Answer ends in 地 (adverbial marker).
                # if pos_tag_han(ans) in ["d", "v"] and pos_tag_han(hw) not in ["d", "v"]:
                if all_ans_tags[idj] in ["d", "v"] and all_hw_tags[idi] not in ["d", "v"]:
                    part_of_speech_s.extend([0]*double_groups)
                elif re.search(r"(.+)地$", ans) is None and (all_hw_tags[idi] == 'a' or all_ans_tags[idj] != 'n'):
                    part_of_speech_s.extend([0]*double_groups)
                elif re.search(r"(.+)的$", hw):
                    part_of_speech_s.extend([0]*double_groups)
                else:
                    part_of_speech_s.extend([1]*double_groups)
            elif ans[-1] in ["地", "的"] and hw[-1] in ["地", "的"] and ans[-1] != hw[-1]:
                # 地 vs 的 suffix conflict.
                part_of_speech_s.extend([0]*double_groups)
            elif ans[-1] not in ["地", "的"] and hw[-1] in ["地", "的"] and ans == hw[:-1]:
                # Answer equals headword minus its 地/的 suffix.
                part_of_speech_s.extend([0]*double_groups)
            elif "的人" in ans and all_ans_tags[idj] == 'n' and all_hw_tags[idi] == 'a' \
                    or ("的人" in hw and all_hw_tags[idi] == 'n' and all_ans_tags[idj] == 'a'):
                # "…的人" noun vs plain adjective on the other side.
                part_of_speech_s.extend([0]*double_groups)
            elif "…" + hw + ";" in ans+";" and phrase_classify(en) in ["prep-phrase", "v-phrase"]:
                part_of_speech_s.extend([0]*double_groups)
            elif re.search("[()()]", hw) is None and re.search(hw+"…+[\u4e00-\u9fa5]", ans):
                # Answer is "hw…<more text>" and hw carries no bracket chars.
                part_of_speech_s.extend([0] * double_groups)
            elif all_ans_tags[idj] == 'v' and re.search("^使", ans):
                # Causative verb: also score the answer with the leading 使 removed.
                row_groups.append((hw, re.sub("^使", "", ans)))
                double_groups += 1
                part_of_speech_s.extend([1] * double_groups)
            else:
                # Fallback: compare POS tag sets; zero the pair only when the
                # two sides share no tag at all.
                hw_pos = [[all_hw_tags[idi]]]  # hw in dict_tags:hw_pos = [pos([hw])]
                ans_pos = [[all_ans_tags[idj]]]
                if hw not in dict_tags and hw in all_tags_with_str:
                    # if hw not in all_tags_with_str:
                    #     a_pos = pos(hw)
                    #     all_tags_with_str[hw] = a_pos
                    hw_pos.extend(all_tags_with_str[hw])
                if ans not in dict_tags and ans in all_tags_with_str:
                    # ans_pos = [pos([ans])]
                    # ans_pos = [[all_ans_tags[idj]]]
                    # if ans not in all_tags_with_str:
                    #     b_pos = pos(ans)
                    #     all_tags_with_str[ans] = b_pos
                    ans_pos.extend(all_tags_with_str[ans])
                # print(hw_pos, ans_pos)
                if all([True if i not in sum(ans_pos, []) else False for i in sum(hw_pos, [])]):
                    # No POS overlap (e.g. 'a' vs 'v'); logic still tentative
                    # per the original comment.
                    part_of_speech_s.extend([0]*double_groups)
                else:
                    part_of_speech_s.extend([1]*double_groups)
            # Record the [start, end) span this answer's variant pairs occupy
            # within the row, so scores can be collapsed back later.
            if not row_groups_locate:
                row_groups_locate.append([0, double_groups])
            else:
                bef = row_groups_locate[-1][1]
                row_groups_locate.append([bef, bef + double_groups])
        double_groups_locate[idi] = row_groups_locate
        # print("row_groups:", row_groups)
        num_per_row.append(len(row_groups))
        all_groups.extend(row_groups)
        # Originally each headword was scored against its answers as one
        # sts() call per row; kept here for reference:
        # simi_score = sts(row_groups)
        # print("part_of_speech_s:", part_of_speech_s)
        # print("simi_score[{}]:".format(row_groups), simi_score)
        # scores_byrow.append(simi_score)
        # if 0 in part_of_speech_s:
        #     scores_byrow[-1] = list(map(lambda x, y: x * y, scores_byrow[-1], part_of_speech_s))
    # Score every pair in a single sts() batch.
    print("词性标注所花时间:", time.time()-st5)
    st1 = time.time()
    print(all_groups)
    # print(part_of_speech_s)
    # NOTE(review): the recovered source made the scope of this guard
    # ambiguous; everything below reads simi_score, so the whole post-scoring
    # section is placed inside it — confirm against the original.
    if all_groups:
        simi_score = sts(all_groups)
        print("99999999时间:", time.time() - st1)
        # print(num_per_row)
        # -------------- locally refresh similarities ---------------------------
        # ------------ for pairs scoring > 0.9, swap the two sides and rescore ------------
        groups_09 = [[i, all_groups[i]] for i, j in enumerate(simi_score) if j > 0.9 and part_of_speech_s[i]]
        rejudge_groups = [(s[1][1], s[1][0]) for s in groups_09]
        print("rejudge_groups>0.9:", rejudge_groups)
        if groups_09 and len(rejudge_groups) <= 3 and len(re.findall("[a-zA-Z'\-\(\)()]+", en.strip())) == 1:
            simi_score_reversed = sts(rejudge_groups)
            # If the reversed pair scores < 0.6, trust the lower score.
            simi_score_le_09 = [i for i, si in enumerate(simi_score_reversed) if si < 0.6]
            if simi_score_le_09:
                for i in simi_score_le_09:
                    simi_score[groups_09[i][0]] = simi_score_reversed[i]
        # ------------- for pairs scoring > 0.8, swap and rescore, keeping the max -------------
        elif not groups_09:
            groups_08 = [[i, all_groups[i]] for i, j in enumerate(simi_score) if j > 0.8 and part_of_speech_s[i]]
            rejudge_groups = [(s[1][1], s[1][0]) for s in groups_08]
            print("rejudge_groups>0.8:", rejudge_groups)
            # NOTE(review): sts() is called here even when rejudge_groups is
            # empty — confirm sts([]) is safe.
            simi_score_reversed = sts(rejudge_groups)
            simi_score_ge_08 = [i for i, si in enumerate(simi_score_reversed) if si > 0.9]
            if simi_score_ge_08:
                for i in simi_score_ge_08:
                    simi_score[groups_08[i][0]] = simi_score_reversed[i]
        # ------------------------------------------------------------------------------
        # Split the flat score list back into per-headword rows, applying the
        # 0/1 POS mask where any pair in the row was suppressed.
        idx = 0
        for n in num_per_row:
            score_a_row = simi_score[idx: idx + n]
            if 0 in part_of_speech_s[idx: idx + n]:
                score_a_row = list(map(lambda x, y: x * y, score_a_row, part_of_speech_s[idx: idx + n]))
            idx += n
            scores_byrow.append(score_a_row)
        # --------------- restore the original group shape -------------
        # Collapse each answer's variant pairs to their max score, but only
        # when the recorded spans are consistent with the produced rows.
        if not is_token and len(double_groups_locate) == len(scores_byrow) and all(
                [True if k1 < len(scores_byrow) and v1[-1][1] == len(scores_byrow[k1]) else False
                 for k1, v1 in double_groups_locate.items()]):
            for idn, row in enumerate(list(double_groups_locate.values())):
                new_row = []
                score_row = scores_byrow[idn]
                if any([True for r in row if r[1] - r[0] > 1]):
                    for r in row:
                        if r[1] - r[0] > 1:
                            new_row.append(max(score_row[r[0]: r[1]]))
                        else:
                            new_row.append(score_row[r[0]])
                else:
                    new_row = score_row
                scores_byrow_rawshape.append(new_row)
    # print(scores_byrow)
    return scores_byrow, scores_byrow_rawshape


def pos_tag_han(w, flag="by_str"):
    """
    POS-tag ``w`` via the ``pos`` backend.

    :param w: a string, or a list of strings when flag="by_list"
    :param flag: "by_str" (default) -> return the tuple ``(w, pos(w))``;
                 any other value -> return ``pos([w])[0]`` for a str,
                 ``pos(w)`` for a list (implicitly None for other types)
    :return: see ``flag``
    """
    # print(hanlp.pretrained.pos.ALL)  # prints all available pretrained models
    if flag == "by_str":
        return w, pos(w)
    if type(w) == str:
        return pos([w])[0]
    elif type(w) == list:
        return pos(w)


if __name__ == '__main__':
    # Ad-hoc manual experiments; most calls are kept commented out.
    import time  # NOTE(review): `time` is already imported at module level
    t1 = time.time()
    # print(pos_tag_han(['环境友好的'], flag="by_list"))
    simi_score = sts([('画廊', '美术画廊'), ('美术画廊', '画廊')])
    print(simi_score)
    # '使物质分解', '破裂', '分解', '消除', '损坏', '机器或车辆出毛病', '讨论、关系或系统失败'
    # '破除', '感情失控(痛哭起来)', '感情失控', '感情失控痛哭起来',
    # han_similarity(['看图猜一电影名', '看图猜电影'],['北京到上海的动车票', '上海到北京的动车票'])
    # a = han_similarity(['抛弃'],['破除','捣毁', '拆除', '破除障碍或偏见', '破除(障碍或偏见)'])
    # # # a = sts([('特别地','尤其'),('特别地','特别')])
    # print(a)
    # a = han_similarity("effective measure", ['有效的措施'], ['有效措施'], is_token=0)
    # print(a)
    # # # print(time.time()-t1)
    a = pos("草拟")
    # print(pos_tag_han("范畴"))
    # a.append(pos_tag("明白"))
    print(a)
    # print(pos_tag(["明白"]))
    # print(pos_tag("明白"))
    # print(pos("知道"))
    # print(pos("明白"))
    # a = pos(["必"])
    # b = pos(["分解"])
    # c = pos(["不是"],)
    # print(c)
    # b1 = [['j'], ['v'], ['v'], ['v']]
    # b2 = [['j'], ['v'], ['n']]
    # print(all([True if i not in b1 else False for i in b2]))
    # aa = ['“垃圾', '废弃物无用的东西', '乌七八糟的东西', '垃圾', '废物']
    # pos_tag_han(aa)
    # rrr = pos_tag_han("不想要的", flag="by_list")
    # print(rrr)