#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Short-text semantic similarity, e.g.:
sts([
    ('看图猜一电影名', '看图猜电影'),
    ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
    ('北京到上海的动车票', '上海到北京的动车票'),
])
"""
import re
import time
from concurrent.futures import ThreadPoolExecutor

from Utils.util import phrase_classify
from my_config import sts, pos, dict_tags

def batch_tag(hw_list, ans_given):
    """
    Batch part-of-speech tagging.
    :param hw_list: headwords to tag
    :param ans_given: candidate answers to tag
    :return: dict mapping each tagged string to its POS tags
    """
    all_tags_with_str = {}
    # Fetch POS tags concurrently via a thread pool.
    hw_lens = [len(i) for i in hw_list]
    ans_given_chosen = []
    if len(ans_given) > 80:  # Tag only a subset of the answers.
        # Prefer answers whose length matches some headword's length.
        ans_chosen_by_len = [a for a in ans_given if len(a) in hw_lens]
        num = 80 - len(ans_chosen_by_len)
        ans_given_chosen.extend(ans_chosen_by_len[:80])
        if num > 0:
            ans_given_chosen.extend(list(set(ans_given) - set(ans_chosen_by_len))[:num])
    else:
        ans_given_chosen = ans_given
    all_ch = hw_list.copy()
    all_ch.extend(ans_given_chosen)
    all_ch = list(set(all_ch))
    print("all_ch:::", all_ch)
    with ThreadPoolExecutor(max_workers=3) as executor:
        for word, tags in executor.map(pos_tag_han, all_ch):
            all_tags_with_str[word] = tags
    return all_tags_with_str
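

# A minimal usage sketch (assuming `pos` from my_config wraps a HanLP-style
# tagger; the strings and tag shapes are illustrative only):
#
#     tags = batch_tag(['画廊'], ['美术画廊', '画廊'])
#     # -> {'画廊': [...], '美术画廊': [...]}  (one tag list per string)
#
# At most 80 answers are tagged: batch_tag is a cache-warming step, and any
# answer missing from the cache simply falls back to the batch tags computed
# in han_similarity.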


def groups_choose(hw_list, ans_given):
    """
    Cap the number of similarity pairs at roughly len(hw_list) * len(ans_given) <= 100.
    :param hw_list: headwords
    :param ans_given: candidate answers
    :return: the (possibly trimmed) answer list
    """
    if len(hw_list) * len(ans_given) > 100:
        length_hw = [len(h.strip()) for h in hw_list]
        new_anss = [a for a in ans_given if len(a) in length_hw]
        anss_rest = [a for a in ans_given if len(a) not in length_hw]
        if len(hw_list) * len(new_anss) <= 90:
            # Back-fill with answers whose length is off by one.
            new_anss2 = [a for a in anss_rest
                         if (len(a) > 2 and len(a) - 1 in length_hw) or len(a) + 1 in length_hw]
            if len(hw_list) * (len(new_anss) + len(new_anss2)) > 100:
                new_anss.extend(new_anss2[:int(100 / len(hw_list)) - len(new_anss)])
            elif len(hw_list) * (len(new_anss) + len(new_anss2)) < 50:
                new_anss.extend(anss_rest[:int(100 / len(hw_list)) - len(new_anss)])
        elif len(hw_list) * len(new_anss) > 100:
            new_anss = new_anss[:int(100 / len(hw_list))]
        return new_anss
    return ans_given
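

# A quick illustration of the cap (pure Python, runnable as-is; the strings
# are made up for the example):
#
#     hws = ['有效措施', '有效的措施']             # 2 headwords, lengths 4 and 5
#     anss = ['答案%02d' % i for i in range(60)]   # 60 four-char answers -> 120 pairs
#     kept = groups_choose(hws, anss)
#     assert len(hws) * len(kept) <= 100           # at most ~100 pairs survive
#
# Length-matched answers are kept first, so near-duplicates of a headword are
# never dropped in favour of strings of unrelated length.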


def han_similarity(en, hw_list, ans_given, cutted_words, is_token=0):
    """
    HanLP-based short-text similarity.
    When a string ends with "的", double-check its part of speech.
    :param en: the English source phrase
    :param hw_list: headwords (candidate translations to score)
    :param ans_given: reference answers
    :param cutted_words: segmented tokens of the headword
    :param is_token: whether hw_list is the output of word segmentation
    :return: scores per row, and the scores restored to the original group shape
    """
    scores_byrow = []
    scores_byrow_rawshape = []
    num_per_row = []
    all_groups = []
    part_of_speech_s = []
    double_groups_locate = {}
    # ---------- First cap the number of similarity pairs at <= 100 ----------
    ans_given = groups_choose(hw_list, ans_given)
    # -------------------------------------------------------------------------
    st5 = time.time()
    # Tag both sides in one batch.
    all_hw_tags = pos_tag_han(hw_list, flag="by_list")
    all_ans_tags = pos_tag_han(ans_given, flag="by_list")
    # Fetch per-string POS tags concurrently.
    all_tags_with_str = batch_tag(hw_list, ans_given)
    print("POS tagging time::", time.time() - st5)
    print("m*n:::", len(hw_list), hw_list, len(ans_given))
    # -------------------------------------------------------------------------
    for idi, hw in enumerate(hw_list):
        print("is_token:", is_token)
        if is_token and all_hw_tags[idi] in ["d", "u", "ud", "c"]:  # Skip adverbs/particles for now when tokenized.
            continue
        hw = re.sub(r"^([\u4e00-\u9fa5])\1", r"\1", hw)  # Collapse a duplicated leading character, e.g. 速率率.
        hw = re.sub(r"^([使得有我令对让向将和与]+)…+$", r"\1", hw)
        row_groups = []
        row_groups_locate = []
        for idj, ans in enumerate(ans_given):
            double_groups = 1
            row_groups.append((hw, ans))
            if cutted_words and len(en.split(" ")) > 1:
                ans0, hw0 = ans, hw
                is_repl = 0
                for j in cutted_words[::-1]:  # Segmented tokens of hw.
                    if j + ";" in ans + ";":
                        ans0 = re.sub(j + "$", "", ans0)
                        hw0 = re.sub(j + "$", "", hw0)
                        is_repl = 1
                if is_repl and ans0 and hw0:
                    # Also score the pair with the shared trailing token removed.
                    row_groups.append((hw0, ans0))
                    double_groups += 1
            if "…" in hw or "…" in ans:
                row_groups.append((hw.replace("…", ""), ans.replace("…", "")))
                double_groups += 1
            if re.sub(r"(.+)的$", r"\1", ans) == hw and all_hw_tags[idi] != "a":
                # ans is hw plus a trailing "的" but hw is not an adjective.
                part_of_speech_s.extend([0] * double_groups)
            elif re.search(r"(.+)的$", ans) and re.search(r"(.+)的$", hw) is None or (
                    re.search(r"(.+)的$", hw) and re.search(r"(.+)的$", ans) is None and all_ans_tags[idj] != 'a'):
                part_of_speech_s.extend([0] * double_groups)
            elif re.search(r".{2,}地$", ans):
                if all_ans_tags[idj] in ["d", "v"] and all_hw_tags[idi] not in ["d", "v"]:
                    part_of_speech_s.extend([0] * double_groups)
                elif re.search(r"(.+)地$", ans) is None and (all_hw_tags[idi] == 'a' or all_ans_tags[idj] != 'n'):
                    part_of_speech_s.extend([0] * double_groups)
                elif re.search(r"(.+)的$", hw):
                    part_of_speech_s.extend([0] * double_groups)
                else:
                    part_of_speech_s.extend([1] * double_groups)
            elif ans[-1] in ["地", "的"] and hw[-1] in ["地", "的"] and ans[-1] != hw[-1]:
                part_of_speech_s.extend([0] * double_groups)
            elif ans[-1] not in ["地", "的"] and hw[-1] in ["地", "的"] and ans == hw[:-1]:
                part_of_speech_s.extend([0] * double_groups)
            elif "的人" in ans and all_ans_tags[idj] == 'n' and all_hw_tags[idi] == 'a' \
                    or ("的人" in hw and all_hw_tags[idi] == 'n' and all_ans_tags[idj] == 'a'):
                part_of_speech_s.extend([0] * double_groups)
            elif "…" + hw + ";" in ans + ";" and phrase_classify(en) in ["prep-phrase", "v-phrase"]:
                part_of_speech_s.extend([0] * double_groups)
            elif re.search("[()()]", hw) is None and re.search(re.escape(hw) + "…+[\u4e00-\u9fa5]", ans):
                part_of_speech_s.extend([0] * double_groups)
            elif all_ans_tags[idj] == 'v' and re.search("^使", ans):
                # Also try the answer with the leading causative "使" stripped.
                row_groups.append((hw, re.sub("^使", "", ans)))
                double_groups += 1
                part_of_speech_s.extend([1] * double_groups)
            else:
                hw_pos = [[all_hw_tags[idi]]]
                ans_pos = [[all_ans_tags[idj]]]
                if hw not in dict_tags and hw in all_tags_with_str:
                    hw_pos.extend(all_tags_with_str[hw])
                if ans not in dict_tags and ans in all_tags_with_str:
                    ans_pos.extend(all_tags_with_str[ans])
                if all(i not in sum(ans_pos, []) for i in sum(hw_pos, [])):
                    # No POS overlap at all, e.g. 'a' vs 'v'; this rule is still tentative.
                    part_of_speech_s.extend([0] * double_groups)
                else:
                    part_of_speech_s.extend([1] * double_groups)
            if not row_groups_locate:
                row_groups_locate.append([0, double_groups])
            else:
                bef = row_groups_locate[-1][1]
                row_groups_locate.append([bef, bef + double_groups])
        double_groups_locate[idi] = row_groups_locate
        num_per_row.append(len(row_groups))
        all_groups.extend(row_groups)
    # Originally each headword was scored against its answers as a separate
    # sts() batch; now all pairs are scored in a single call below.
    print("Time spent on POS tagging:", time.time() - st5)
    st1 = time.time()
    print(all_groups)
    if all_groups:
        simi_score = sts(all_groups)
        print("sts() time:", time.time() - st1)
        # ---------- Locally update similarities ----------
        # For pairs scoring > 0.9, swap the pair order and recompute.
        groups_09 = [[i, all_groups[i]] for i, j in enumerate(simi_score) if j > 0.9 and part_of_speech_s[i]]
        rejudge_groups = [(s[1][1], s[1][0]) for s in groups_09]
        print("rejudge_groups>0.9:", rejudge_groups)
        if groups_09 and len(rejudge_groups) <= 3 and len(re.findall(r"[a-zA-Z'\-()()]+", en.strip())) == 1:
            simi_score_reversed = sts(rejudge_groups)
            low_after_swap = [i for i, si in enumerate(simi_score_reversed) if si < 0.6]
            for i in low_after_swap:
                simi_score[groups_09[i][0]] = simi_score_reversed[i]
        # For pairs scoring > 0.8, swap and recompute; keep the swapped score
        # if it exceeds 0.9.
        elif not groups_09:
            groups_08 = [[i, all_groups[i]] for i, j in enumerate(simi_score) if j > 0.8 and part_of_speech_s[i]]
            rejudge_groups = [(s[1][1], s[1][0]) for s in groups_08]
            print("rejudge_groups>0.8:", rejudge_groups)
            if rejudge_groups:
                simi_score_reversed = sts(rejudge_groups)
                high_after_swap = [i for i, si in enumerate(simi_score_reversed) if si > 0.9]
                for i in high_after_swap:
                    simi_score[groups_08[i][0]] = simi_score_reversed[i]
        # --------------------------------------------------------------------
        idx = 0
        for n in num_per_row:
            score_a_row = simi_score[idx: idx + n]
            if 0 in part_of_speech_s[idx: idx + n]:
                # Zero out pairs whose parts of speech were judged incompatible.
                score_a_row = list(map(lambda x, y: x * y, score_a_row, part_of_speech_s[idx: idx + n]))
            idx += n
            scores_byrow.append(score_a_row)
        # ---------- Restore the scores to the original group shape ----------
        if not is_token and len(double_groups_locate) == len(scores_byrow) and all(
                k1 < len(scores_byrow) and v1[-1][1] == len(scores_byrow[k1])
                for k1, v1 in double_groups_locate.items()):
            for idn, row in enumerate(list(double_groups_locate.values())):
                new_row = []
                score_row = scores_byrow[idn]
                if any(r[1] - r[0] > 1 for r in row):
                    for r in row:
                        if r[1] - r[0] > 1:
                            # Several variants were scored for this pair; keep the best.
                            new_row.append(max(score_row[r[0]: r[1]]))
                        else:
                            new_row.append(score_row[r[0]])
                else:
                    new_row = score_row
                scores_byrow_rawshape.append(new_row)
    return scores_byrow, scores_byrow_rawshape
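

# A usage sketch for han_similarity (illustrative; `sts` and `pos` come from
# my_config, so actual scores depend on the configured models):
#
#     rows, rows_rawshape = han_similarity(
#         "effective measure",      # English source phrase
#         ['有效的措施'],           # headwords
#         ['有效措施'],             # reference answers
#         ['有效', '措施'],         # segmented tokens of the headword
#         is_token=0)
#     # rows: one score list per headword, zeroed where POS gating rejected a pair
#     # rows_rawshape: the same scores collapsed back to one per (hw, ans) pair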


def pos_tag_han(w, flag="by_str"):
    """
    Part-of-speech tagging.
    flag="by_str": w is a single string; return (w, pos(w)) so the caller can
    key results collected from a thread pool.
    flag="by_list": w is a string or a list of strings; return the tag(s) only.
    """
    # print(hanlp.pretrained.pos.ALL)  # List all pretrained POS models.
    if flag == "by_str":
        return w, pos(w)
    if isinstance(w, str):
        return pos([w])[0]
    elif isinstance(w, list):
        return pos(w)
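

# Expected shapes for the two flags (assuming `pos` maps a list of strings to
# a list of tags, HanLP-style; the tags themselves are illustrative):
#
#     pos_tag_han("画廊")                            # -> ("画廊", pos("画廊"))
#     pos_tag_han("画廊", flag="by_list")            # -> pos(["画廊"])[0]
#     pos_tag_han(["画廊", "草拟"], flag="by_list")  # -> pos(["画廊", "草拟"])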


if __name__ == '__main__':
    t1 = time.time()
    # print(pos_tag_han(['环境友好的'], flag="by_list"))
    simi_score = sts([('画廊', '美术画廊'), ('美术画廊', '画廊')])
    print(simi_score)
    # Sample answer sets kept around for manual testing:
    # '使物质分解', '破裂', '分解', '消除', '损坏', '机器或车辆出毛病', '讨论、关系或系统失败'
    # '破除', '感情失控(痛哭起来)', '感情失控', '感情失控痛哭起来',
    # han_similarity(['看图猜一电影名', '看图猜电影'], ['北京到上海的动车票', '上海到北京的动车票'])
    # a = han_similarity(['抛弃'], ['破除', '捣毁', '拆除', '破除障碍或偏见', '破除(障碍或偏见)'])
    # a = sts([('特别地', '尤其'), ('特别地', '特别')])
    # print(a)
    # a = han_similarity("effective measure", ['有效的措施'], ['有效措施'], is_token=0)
    # print(a)
    # print(time.time() - t1)
    a = pos("草拟")
    print(a)
    # print(pos("知道"))
    # print(pos("明白"))
    # a = pos(["必"])
    # b = pos(["分解"])
    # c = pos(["不是"])
    # print(c)
    # b1 = [['j'], ['v'], ['v'], ['v']]
    # b2 = [['j'], ['v'], ['n']]
    # print(all(i not in b1 for i in b2))
    # aa = ['“垃圾', '废弃物无用的东西', '乌七八糟的东西', '垃圾', '废物']
    # pos_tag_han(aa)
    # rrr = pos_tag_han("不想要的", flag="by_list")
    # print(rrr)