123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666 |
- # -*- coding: utf-8 -*-
- # import re
- # import time
- import json
- import my_config
- import func_timeout
- # import numpy as np
- from sklearn.metrics.pairwise import cosine_similarity
- from Final_word_Similarity.Hybrid_Sim import HybridSim
- from sentence_transformers import util
- # from item_embedding.chinese_emb import item2emb_cn
- from item_embedding.all_lang_emb import item2emb_all
- from Words.Dicts import errmean_words
- from full_wash import ChWash
- from Words.Phrase_dict import errmean_en_dict, phrases_dict_tk
- from basic_logic import ItemInit, get_mean_in_dict
- from Utils.wrong_written_judgement import err_mean_judge, KM_cidian_check, youdao_cidian_check
- from Utils.han_semantic_similarity import han_similarity, pos_tag_han
- from Utils.util import *
- from Words.Dicts import single_mean_words, fixed_mean_words_dict
- from Words.Words_classify import mean_fixed_words
- from Words.ch2en_dict import ch_to_en_online
- from Words.word_dict_from_textbook import word2mean_high, word2mean_junior
- from Words.syn_antonyms import syn_km, syn_ft, syn_bing
- from Words.phrases_syn_antonyms import phrase_syn_bing, phrase_syn_km
- from Utils.translator import KM_ch2en, haici_zh2en, ch2en_baidu
- from concurrent.futures import ThreadPoolExecutor
- from func_timeout import func_set_timeout
- # import gc
- # from func_timeout import func_set_timeout
- # logger = my_config.myLog(__name__, log_cate="e2cc_log").getlog()
- ch2en_logger = my_config.simpLog(__name__, log_cate="ch2en_online_log").getlog()
- """
- 存在错误类型:有多个手写意思,其中一个ocr错误,这算错还是对呢---62df443d87b408c08c51752c
- 规范作答测试:写错要全部涂黑
- """
def bert_similarity(vc_model):
    """Cosine similarity between the two halves of a stacked vector batch.

    Splits ``vc_model`` into two equal row-blocks along axis 0 and returns
    the pairwise cosine-similarity matrix between them. Intended for
    sentence vectors; per the original note, word vectors work poorly here.

    :param vc_model: array of stacked vectors, row count must be even.
    :return: cosine-similarity matrix between the first and second half.
    """
    first_half, second_half = np.split(vc_model, 2, axis=0)
    return cosine_similarity(first_half, second_half)
- class MainJudge(ItemInit):
- # @staticmethod
def hw_ch2en(self, mean_hw):
    """Translate one handwritten Chinese answer to English via online services.

    Queries up to three translation backends (Haici, Baidu, KM) in parallel
    threads; the backend count is throttled down as the number of answers
    pending translation (``self.need_ch2en_hw_num``) grows, to limit fan-out.

    :param mean_hw: one handwritten Chinese meaning (str).
    :return: ``(mean_hw, washed)`` where ``washed`` is the de-duplicated list
        of English candidates; entries like ``"word(s)"`` are expanded into
        ``["word", "words"]`` when the bracketed part is a plural/-ing suffix.
    """
    ch2en_online = []

    # Each backend wrapper is best-effort: a failed service must not abort
    # the others, so only Exception (not SystemExit/KeyboardInterrupt) is
    # swallowed — the original bare `except:` blocked interpreter shutdown.
    def km_ch2en(s):
        try:
            ch2en_online.extend(KM_ch2en(s))
        except Exception:
            pass

    def hc_ch2en(s):
        try:
            ch2en_online.extend(haici_zh2en(s))
        except Exception:
            pass

    def bd_ch2en(s):
        try:
            ch2en_online.extend(ch2en_baidu(s))
        except Exception:
            pass

    # Throttle the number of backends as the pending-translation count grows.
    all_func = [hc_ch2en, bd_ch2en, km_ch2en]
    if self.need_ch2en_hw_num >= 3:
        all_func = [hc_ch2en]
    elif self.need_ch2en_hw_num >= 2:
        all_func = [hc_ch2en, bd_ch2en]
    if len(all_func) == 1:
        all_func[0](mean_hw)
    else:
        # IO-bound requests, so threads are the right tool here.
        with ThreadPoolExecutor(max_workers=len(all_func)) as executor:
            for func in all_func:
                executor.submit(func, mean_hw)

    ch2en_online_washed = []
    if ch2en_online:
        for c in set(ch2en_online):
            c = c.replace("'", "'")  # NOTE(review): reads as a no-op; presumably normalizes a full-width apostrophe — confirm original bytes
            new_c = [c]
            if len(re.findall("[((].+?[))]", c)) == 1:
                c_1 = re.split("[((](.+?)[))]", c)
                if c_1[1] in ["s", "es", "ing"]:
                    # "word(s)" -> ["word", "words"]
                    new_c = [c_1[0], c_1[0] + c_1[1]]
            ch2en_online_washed.extend(new_c)
    return mean_hw, ch2en_online_washed
def synonyms_cluster_judge(self, en_ptag):
    """Judge the handwritten meanings via synonym/antonym clustering.

    Strategy: (1) map each handwritten Chinese meaning to English candidates,
    then look those up in the English synonym/antonym tables; (2) map the
    English synonyms back to Chinese definitions and compare again.

    :param en_ptag: part-of-speech / phrase tag of ``self.word``
        (e.g. "n-phrase", "JJ", "RB", "ADV").
    :return: 1 correct, 0 wrong, -1 undecided (caller continues with
        similarity-based judging).
    """
    # Fixed-meaning (textbook-sense) words: every handwritten answer must
    # appear in the collected fixed-meaning list, nothing else counts.
    if self.word in single_mean_words or self.word in fixed_mean_words_dict \
            or self.word in sum(list(mean_fixed_words.values()), []):
        self.is_word_with_fixed_mean = 1
        fixed_mean = []
        if self.word in word2mean_high:
            fixed_mean.append(list(word2mean_high[self.word].values()))
        if self.word in word2mean_junior:
            fixed_mean.append([word2mean_junior[self.word]])
        if self.word in fixed_mean_words_dict:
            fixed_mean.append([fixed_mean_words_dict[self.word].replace("\n", ";")])
        fixed_mean.append([self.fuller_means_simpwashed])
        fixed_mean_list = ChWash(self.word, ";".join(sum(fixed_mean, []))).zh_full_wash(source="hyk")
        fixed_mean_list = means_split(";".join(fixed_mean_list))  # split into individual meanings
        if all([True if mean_hw in fixed_mean_list else False for mean_hw in self.mean_hw_list]):
            return 1
        else:
            return 0
    else:  # synonym / antonym based judging
        # Gather synonyms/antonyms from every available table (word + phrase).
        synonyms, antonyms = [], []
        if self.word in syn_km:
            synonyms.extend(sum(list(syn_km[self.word]["synonyms"].values()), []))
            antonyms.extend(sum(list(syn_km[self.word]["antonyms"].values()), []))
        if self.word in syn_ft:
            synonyms.extend(syn_ft[self.word]["synonyms"])
            antonyms.extend(syn_ft[self.word]["antonyms"])
        if self.word in syn_bing:
            synonyms.extend(syn_bing[self.word]["synonyms"])
            antonyms.extend(syn_bing[self.word]["antonyms"])
        if self.word in phrase_syn_km:
            synonyms.extend(sum(list(phrase_syn_km[self.word]["synonyms"].values()), []))
            antonyms.extend(sum(list(phrase_syn_km[self.word]["antonyms"].values()), []))
        if self.word in phrase_syn_bing:
            synonyms.extend(phrase_syn_bing[self.word]["synonyms"])
            antonyms.extend(phrase_syn_bing[self.word]["antonyms"])
        # NOTE(review): the replace below reads as a no-op; presumably it
        # normalizes a full-width apostrophe — confirm the original bytes.
        synonyms = list([s.replace("'", "'") for s in set(synonyms)])
        antonyms = list([a.replace("'", "'") for a in set(antonyms)])
        # A word's synonyms differ per sense: an English candidate can be in
        # the synonym table without sharing the sense that was written.
        is_all_synonyms = 1           # stays 1 while every checked hw looks synonymous
        hw2en_equal_word = []         # hw whose English equals the prompt word itself
        no_synonyms = 0               # set when no synonym data exists at all
        is_all_hw_en_exist = []       # collects -1 for each hw lacking English candidates
        need_ch2en_online = []        # hw that must be translated online
        # ---- collect the handwritten answers that need online translation ----
        time1 = time.time()  # left over for a removed timing print
        for mean_hw in self.mean_hw_list:
            if "(" not in mean_hw and "(" not in mean_hw:  # bracketed answers are skipped
                need_ch2en_online.append(mean_hw)
        # ---- fetch English online ----
        need_ch2en_online = list(set(need_ch2en_online))
        need_ch2en_online = [ch for ch in need_ch2en_online if ch not in ch_to_en_online]
        # NOTE(review): after the filter above no remaining entry can be in
        # ch_to_en_online, so this dict always starts empty — the cache lookup
        # probably should precede the filter; confirm intent.
        ch2en_dict_online = {ch: ch_to_en_online[ch] for ch in need_ch2en_online if ch in ch_to_en_online}
        if need_ch2en_online:
            max_workers = 1
            if len(need_ch2en_online) >= 6:
                max_workers = 6
            elif len(need_ch2en_online) > 1:
                max_workers = len(need_ch2en_online)
            self.need_ch2en_hw_num = len(need_ch2en_online)  # lets hw_ch2en throttle its backends
            if max_workers > 1:
                with ThreadPoolExecutor(max_workers=max_workers) as executor:
                    for future in executor.map(self.hw_ch2en, need_ch2en_online):
                        ch2en_dict_online[future[0]] = future[1]
                        if future[1]:  # persist freshly fetched ch2en pairs (log merged back daily)
                            ch2en_logger.info(json.dumps({"{}".format(future[0]): future[1]}, ensure_ascii=False))
            else:  # only a single answer needs online translation
                hw, hw2en = self.hw_ch2en(need_ch2en_online[0])
                if hw2en:
                    ch2en_dict_online[hw] = hw2en
                    ch2en_logger.info(json.dumps({"{}".format(hw): hw2en}, ensure_ascii=False))
        # ---- judge each handwritten meaning via its English candidates ----
        for mean_hw in self.mean_hw_list:
            dict_ch2en = my_config.dict_ch2en
            hw_en = []
            if mean_hw in dict_ch2en:  # curated offline ch->en dictionary first
                hw_en = dict_ch2en[mean_hw]
            if mean_hw in ch2en_dict_online and ch2en_dict_online[mean_hw]:
                hw_en.extend(ch2en_dict_online[mean_hw])
            if hw_en:
                hw_en = list(set(map(lambda x: x.lower(), hw_en)))  # lowercase + dedupe
                if any([True for en in hw_en if en in antonyms and en not in synonyms]):  # antonym -> wrong
                    return 0
                if any([True for en in hw_en if en == self.word]):  # translation equals the prompt word
                    # Wrong if the reference meanings only differ by a 的/地
                    # suffix, though a non-adjective synonym may still save it.
                    if any([re.search(mean_hw+"[的地]$", u) for u in self.union_means]):
                        return -1
                    if not self.mean_hw_cutted:
                        continue
                    else:  # several meanings, truncated: one match suffices
                        return 1
                if en_ptag == "n-phrase" and mean_hw not in self.mean_hw_cutted and \
                        any([True for en in hw_en if en != self.word and re.search("^"+en + "[a-z']{,4} [a-z]{3,}$", self.word)]):
                    return 0
                if any([True for en in hw_en if en != self.word and re.search("^"+en + "[a-z']{,4}$", self.word)]):
                    # e.g. "steady progress" vs "steady progression"; "utter" vs "utterly"
                    equal_en = [en for en in hw_en if en != self.word and re.search("^"+en + "[a-z']{,4}$", self.word)
                                and en + 'ly' == self.word and en_ptag in ["RB", "ADV"]]
                    if equal_en:
                        return 0
                    continue
                if any([True for en in hw_en if en == self.word.replace(" ", "") and
                        self.word.replace(" ", "") not in ["breakdown"]]):  # prompt minus spaces equals en
                    hw2en_equal_word.append(mean_hw)
                elif not self.mean_hw_cutted and any([True for en in hw_en if en != self.word
                                                      and re.search("\s"+en+"|"+en + "\s", self.word)
                                                      and en_ptag == "n-phrase"]):
                    # noun phrase where only half of it was translated
                    return 0
                elif not synonyms:
                    no_synonyms = 1
                    break
                elif all([True if en not in synonyms else False for en in hw_en]):  # none of the en are synonyms
                    # TODO(review): should adjectives/adverbs that fail here still
                    # proceed to similarity matching? (original open question)
                    if not self.is_phrase:  # single word
                        if re.search("[\u4e00-\u9fa5]+的[\u4e00-\u9fa5]", mean_hw):  # e.g. virtue(高尚的品德)
                            return -1
                        if len(mean_hw) <= 2:
                            is_all_synonyms = 0  # some candidate is not a synonym
                        else:
                            # fall back to sentence-bert similarity (computed lazily once)
                            if not self.sbert_row_maxsimi_res:
                                embs = item2emb_all(self.mean_hw_list + list(self.union_means))
                                self.sbert_simi_res = util.cos_sim(embs[0:len(self.mean_hw_list)],
                                                                   embs[len(self.mean_hw_list):])
                                self.sbert_row_maxsimi_res = [max(s) for s in self.sbert_simi_res.tolist()]
                            if max(self.sbert_row_maxsimi_res) >= 0.9:
                                return -1
                            else:
                                is_all_synonyms = 0  # some candidate is not a synonym
                    else:  # phrases continue to the later similarity stage
                        return -1
                else:
                    # Some en is in the synonym table, but possibly not in the
                    # sense that was written down.
                    syns = [en for en in hw_en if en in synonyms]
                    for syn in syns:
                        _, syn_mean = get_mean_in_dict(syn.strip(), mod="all")
                        # mean_hw appearing inside syn's Chinese definition may
                        # or may not be the matching sense — leave undecided.
                        if re.search("(?<![\u4e00-\u9fa5])"+mean_hw+"(?![\u4e00-\u9fa5])", syn_mean):
                            return -1
                        # Filter by part of speech (some listed synonyms are wrong-POS).
                        pos_mean = re.findall("(?<!=[a-z])([a-z]+)\.", syn_mean)
                        pos_fuller_mean = re.findall("(?<!=[a-z])([a-z]+)\.", self.fuller_means_nowashed)
                        if re.search("n\.\s*"+mean_hw, syn_mean) and pos_fuller_mean and \
                                all([True if a[0] in ["v", "V"] else False for a in pos_fuller_mean]):
                            return 0
                        if (re.search("的$", mean_hw) is None and pos_tag_han(mean_hw, flag="by_list") != 'a') and \
                                (phrase_classify(syn) == "JJ" or (pos_mean and
                                 all([True if a[0] == "adj" else False for a in pos_mean]))):
                            return 0
                    if not self.sbert_row_maxsimi_res:
                        embs = item2emb_all(self.mean_hw_list + list(self.union_means))
                        self.sbert_simi_res = util.cos_sim(embs[0:len(self.mean_hw_list)], embs[len(self.mean_hw_list):])
                        self.sbert_row_maxsimi_res = [max(s) for s in self.sbert_simi_res.tolist()]
                    if max(self.sbert_row_maxsimi_res) < 0.6:
                        return 0
                    else:
                        return -1
            else:
                is_all_hw_en_exist.append(-1)  # no English candidates for this hw
        if hw2en_equal_word:
            # Drop answers already proven equivalent to the prompt word itself.
            self.mean_hw_list = [j for j in self.mean_hw_list if j not in hw2en_equal_word]
        if not is_all_synonyms:
            return 0
        elif no_synonyms:
            return -1
        elif -1 in is_all_hw_en_exist:
            return -1
        else:
            return 1
def cilin_similarity_judge(self):
    """Judge via Cilin (同义词词林) hybrid word similarity.

    Scores every (handwritten meaning, reference answer) pair with
    ``HybridSim`` and cross-checks high or unknown Cilin scores against the
    hanlp similarity matrix. Pairs containing brackets/commas are skipped.

    :return: 1 when every scored row is >= 0.9; 0 when a row is plainly
        wrong; otherwise the per-row score list (rows already >= 0.9 are
        removed from ``self.mean_hw_list`` so later stages skip them).
    """
    scores = []
    t3 = time.time()  # left over for the removed timing print
    hb = HybridSim()
    for idi, mean_hw in enumerate(self.mean_hw_list):
        if re.search("[(()),,、]", mean_hw):
            continue
        one_scores = []
        for idj, ans in enumerate(self.ans_list):
            if re.search("[(()),,、]", ans):
                continue
            cilin_score = hb.get_Final_sim(mean_hw, ans)
            # -2 means the word is absent from Cilin; -1 handling is still an
            # open question per the original note.
            if cilin_score >= 0.9 or cilin_score == -2:
                # double-check with the hanlp similarity (computed lazily once)
                if not self.simires_han_rawshape:
                    self.han_simi_res, self.simires_han_rawshape = han_similarity(self.word, self.mean_hw_list,
                                                                                 self.ans_list, self.cutted_words)
                han_score = self.han_simi_res[idi][idj]
                if han_score > 0.4:
                    if cilin_score == -2:  # not in Cilin: trust the hanlp score
                        one_scores.append(han_score)
                    else:
                        one_scores.append(cilin_score)
                elif han_score == 0.0:
                    one_scores.append(0.0)
                else:
                    one_scores.append(han_score)
            else:
                one_scores.append(cilin_score)
        if one_scores:
            if max(one_scores) == 0.0 and min(one_scores) != -1:
                return 0
            scores.append(max(one_scores))
    # Vacuously 1 when every row was skipped (scores empty).
    if all([True if s >= 0.9 else False for s in scores]):
        return 1
    else:
        # Rows already >= 0.9 need no further comparison downstream.
        # NOTE(review): when a mean_hw was skipped by the bracket filter the
        # indices of `scores` shift relative to self.mean_hw_list, so this
        # removal may drop the wrong row — confirm.
        idx_09 = [i for i, s in enumerate(scores) if s >= 0.9]
        if idx_09:
            self.mean_hw_list = [mean for i, mean in enumerate(self.mean_hw_list) if i not in idx_09]
        return scores
def score_res(self, general_score):
    """Shape the final result according to ``self.mod``.

    Seeds ``self.score_onbook`` with ``general_score`` when it is still at
    its -1 sentinel and a textbook-mode result is requested.

    :param general_score: score from the general grading pass.
    :return: mod="all" -> (general_score, score_onbook);
             mod="book" -> score_onbook; otherwise general_score.
    """
    if self.mod not in ["book", "all"]:
        return general_score
    if self.score_onbook == -1:
        self.score_onbook = general_score
    if self.mod == "all":
        return general_score, self.score_onbook
    return self.score_onbook
# @func_set_timeout(1.2)
def judge_err_wrriten(self):
    """Score the wrongly-written-character (typo) check.

    Returns 1 when the handwriting is typo-free, or when the flagged word
    turns out to exist in the KM / Youdao dictionaries (i.e. a false alarm).
    Otherwise records the failure in ``self.score_on_err_wrriten`` and
    returns the sub-1 result from ``err_mean_judge``.

    :return: 1 for "no error", else the raw ``err_mean_judge`` value (< 1).
    """
    noerr_res, _char_sim, zhword_judged = err_mean_judge(self.mean_hw_list, self.ans_list)
    if noerr_res >= 1:
        return 1
    # The corrector flagged a word: trust the flag only when neither
    # dictionary knows the word (short-circuits exactly like the original
    # nested checks: Youdao is queried only if the KM lookup fails).
    if KM_cidian_check(zhword_judged) or youdao_cidian_check(zhword_judged):
        return 1
    self.score_on_err_wrriten = 0
    return noerr_res
def emb_similar(self, en, hw_list, ans_given, is_token=0, noerr=1, flag=0):
    """Model-based semantic similarity between handwriting and answers.

    Runs the hanlp short-text similarity model and, for near-threshold
    results, cross-checks with the sentence-bert model.

    :param en: the English prompt word/phrase.
    :param hw_list: handwritten Chinese meanings (possibly tokenized).
    :param ans_given: reference meanings to compare against.
    :param is_token: 1 when ``hw_list`` holds tokenized answers.
    :param noerr: scale factor from the typo check (< 1 when a typo was
        found); multiplied into every similarity before thresholding.
    :param flag: 1 passes ``self.cutted_words`` through to hanlp.
    :return: 1 / 0, or a float score for the caller to threshold.
    """
    score = 0
    # hanlp semantic-similarity model: suited to short text.
    st7 = time.time()
    if flag == 1:
        self.han_simi_res, _ = han_similarity(en, hw_list, ans_given, self.cutted_words, is_token)
    else:
        self.han_simi_res, _ = han_similarity(en, hw_list, ans_given, [], is_token)
    print("hanlp模型所花时间:", time.time() - st7)
    # best match per handwritten row (empty rows skipped)
    row_maxsimi_res = [max(s) for s in self.han_simi_res if s]
    if is_token:
        if all([True if i * noerr > 0.9 else False for i in row_maxsimi_res]):
            score = 1
    # non-token case: a zero-similarity row that was not truncated -> wrong
    elif min(row_maxsimi_res) * noerr == 0 and hw_list[row_maxsimi_res.index(min(row_maxsimi_res))] \
            not in self.mean_hw_cutted:
        score = 0
    elif max(row_maxsimi_res) * noerr >= 0.895:
        # near/above threshold: confirm with sentence-bert
        st8 = time.time()
        embs = item2emb_all(hw_list + ans_given)  # recomputed here — inputs differ from earlier stages
        self.sbert_simi_res = util.cos_sim(embs[0:len(hw_list)], embs[len(hw_list):]).tolist()
        self.sbert_row_maxsimi_res = [max(s) for s in self.sbert_simi_res]
        print("sentences-bert模型所花时间:", time.time() - st8)
        if max(self.sbert_row_maxsimi_res) < 0.45:
            score = max(self.sbert_row_maxsimi_res)
        else:
            score = 1
            # The two models' row indices are not kept in sync yet (original
            # TODO): veto a high hanlp cell whose same-index sbert cell is low.
            for n, one_res in enumerate(self.han_simi_res):
                if max(one_res) > 0.9:
                    maxres_idx = one_res.index(max(one_res))
                    if maxres_idx < len(self.sbert_simi_res[n]) and self.sbert_simi_res[n][maxres_idx] < 0.45:
                        score = 0
                        break
    else:
        score = max(row_maxsimi_res) * noerr
    return score
# @func_set_timeout(6)
def main_judge(self, again=0):
    """Full semantic-equivalence judgement pipeline for one item.

    Stages: (1) direct answer comparison against the question bank;
    (2) special-case filters (known wrong meanings, 使/令 causative prefixes,
    的/得/地 confusions, phrase-POS checks); (3) synonym/antonym clustering;
    (4) typo check; (5) model similarity (hanlp + sentence-bert).
    In textbook mode a zero comparison still falls through to similarity;
    words absent from the textbook are graded in the general mode.

    :param again: 1 on the second pass of mod="all" — reuses the scores
        cached on ``self`` by the first pass instead of recomputing.
    :return: 0/1 in most paths; some early returns pass the raw score through.
    """
    # ---- stage 1: plain answer comparison ----
    st1 = time.time()
    score = self.simp_judge()
    print("简单判断时间:", time.time()-st1)
    if score == "0" or score == 1:
        return int(score)
    else:
        if not self.mean_hw_list:
            return 0
    # ---- words with known-wrong meanings get dedicated handling ----
    washed_word = re.sub("…+\s*$", "", self.word).strip()
    if washed_word in errmean_words and any([True for hw in self.mean_hw_list
                                             if re.sub("…+\s*$", "", hw).strip() in errmean_words[washed_word]]):
        return 0
    if washed_word in errmean_en_dict or self.err_mean_list:
        temp_mean_hw_list = [re.sub("(?<!=[….])(…+|\.{3,})", "…", m) for m in self.mean_hw_list]
        errmean_list = self.err_mean_list
        if washed_word in errmean_en_dict:
            errmean_list.extend(re.split("[;;,,]", errmean_en_dict[washed_word]))
        if any([True for hw in temp_mean_hw_list if re.sub("…+\s*$", "", hw).strip() in errmean_list]):
            return 0
    # ---- special cases ----
    # e.g. 生效 ("take effect") differs from 使生效 ("make sth. take effect")
    try:
        if any([True for m in self.mean_hw_list if re.search(r"[使令对让].{,2}" + m, ";".join(self.union_means))]):
            return 0
        if any([True for u in self.union_means if re.search(r"[使令对让].{,2}" + u, ";".join(self.mean_hw_list))
                and re.search(r"(?<=[);])[使令对让].+",
                              ";" + ";".join(self.union_means)) is None]):
            return 0
    except:  # best-effort: patterns are built from answer text and may be invalid regex
        pass
    # 的 vs 得 vs 地 confusion
    if any([True for hw in self.mean_hw_list if re.search("[\u4e00-\u9fa5]{2,}的[\u4e00-\u9fa5]+", hw) and \
            re.sub(r"([\u4e00-\u9fa5]{2,})的([\u4e00-\u9fa5]+)", r"\1得\2", hw) in ";".join(self.union_means)
            and all([True if re.search("的(?!$)", a) is None else False for a in self.union_means])]):
        return 0
    # "xxx for/of": answering only the noun part's meaning is wrong
    w1 = re.search("^(.+?) (for|of)$", self.word.strip())
    if w1 and phrase_classify(w1.group(1)) in ["n-phrase", "NN"]:
        _, part_mean = get_mean_in_dict(w1.group(1).strip(), mod="all")
        part_mean_list = means_split(part_mean)
        if any([True for hw in self.mean_hw_list if hw in part_mean_list]):
            return 0
    # For phrases, determine the phrase type first (verb vs adj/noun phrase).
    ptag = phrase_classify(self.word)
    if ptag == "v-phrase" and any([True for h in self.mean_hw_list if re.search(r"(.+)的$", h)]):
        return 0
    if ptag == 'prep-phrase' and self.word.split(" ")[0] in ["in", "with", "on", "at"] \
            and any([True for h in self.mean_hw_list if re.search("的[\u4e00-\u9fa5]+[^中上下]$", h)
                     and re.search("^在.+?的[\u4e00-\u9fa5]+$", h) is None]):  # e.g. "with reason" vs …的理由
        return 0
    # 忽略的 ("ignored") differs from 可忽略的 ("negligible")
    if re.search("ble$", self.word) and any([True for hw in self.mean_hw_list
                                             if re.search("可以?" + hw + "的?", self.fuller_means_simpwashed)]):
        return 0
    # Adjective prompt answered with the noun form
    if any([True for hw in self.mean_hw_list if hw + "的" in self.union_means and ptag == "JJ"]):
        return 0
    union_mean_txt = self.ans_given + ";" + self.fuller_means_simpwashed
    if any([True for hw in self.mean_hw_list if (hw[-1] == "地" and hw[:-1] + "的" in union_mean_txt and
            re.search("的(?![\u4e00-\u9fa5])", union_mean_txt) is None)
            or (hw[-1] == "的" and hw[:-1] + "地" in union_mean_txt and
                re.search("地(?![\u4e00-\u9fa5])", union_mean_txt) is None)]):
        return 0
    # ---- stage 3: synonym clustering ----
    # (original TODO: verify the impact of removing this stage; in textbook
    # mode a synonym sense is not necessarily a textbook sense)
    # -2 marks the untouched initial value (mod="all" runs this method twice).
    if not again or self.syn_judge_score == -2:
        st1 = time.time()
        self.syn_judge_score = self.synonyms_cluster_judge(ptag)
        print("聚类判断时间:", time.time() - st1)
    score = self.syn_judge_score  # on again=1 this reuses the previous pass's value
    print("同义判断得分:", score)
    if score != -1:
        return score
    else:
        if self.is_word_with_fixed_mean:  # fixed-meaning word that failed the fixed list
            return 0
    # ---- stage 4: typo check — a single typo fails the item ----
    st1 = time.time()
    if not again or self.noerr_res == -1:  # -1 marks the untouched initial value
        self.noerr_res = self.judge_err_wrriten()
    print("判错时间:", time.time() - st1)
    print(self.score_on_err_wrriten)
    if self.score_on_err_wrriten == 0:  # on again=1 this reuses the previous pass's value
        return 0
    noerr_res = self.noerr_res
    # ---- Cilin similarity stage is currently disabled (see git history) ----
    # ---- stage 5: model similarity ----
    if score < 0.9:  # score is -1 here (only -1 reaches this point), so this always runs
        score = self.emb_similar(self.word, self.mean_hw_list, self.ans_list, noerr=noerr_res, flag=1)  # 1st call
    if score == 1:
        self.final_score = score
    else:
        score_first = score
        if self.hw_with_brace:
            self.mean_hw_list.extend(sum(self.hw_with_brace.values(), []))
        if self.more_mean:  # retry against the extended meaning set
            score = self.emb_similar(self.word, self.mean_hw_list, self.more_mean, noerr=noerr_res)  # 2nd call
            if not score or score == 1:
                return score
            if score_first < 0.1 and score < 0.1:
                return 0
        if time.time() - self.stime < 3:  # token-level retries only inside the time budget
            if self.cutted_words:
                final_score = self.emb_similar(self.word, self.cutted_words, self.ans_list, 1, noerr=noerr_res)  # 3rd call
                if final_score and final_score != 1:
                    more_means = list(set(self.more_mean) - set(self.ans_list))
                    final_score = self.emb_similar(self.word, self.cutted_words, more_means, 1, noerr=noerr_res)  # 4th call
                if final_score < 1:
                    self.final_score = 0
                # NOTE(review): when final_score reaches 1 here, self.final_score
                # keeps its prior value — confirm its initialisation in ItemInit.
            else:
                self.final_score = 0
        else:
            self.final_score = 1 if score > 0.8 else 0
    return self.final_score
def __call__(self):
    """Return the judgement score(s) according to the grading mode.

    Modes: "book" (textbook meanings), "general", or "all" (both). When no
    textbook meaning exists, grading falls back to the general mode.

    :return: mod="all" -> (general_score, book_score, mean_hw_nindict);
             otherwise -> (score, mean_hw_nindict).
    """
    self.simp_wash()
    if not self.mean_hw_list:  # nothing legible was written
        if self.mod == "all":
            return 0, 0, []
        return 0, []
    self.getmean_inbook()
    if self.mod == "all":
        # First pass in general mode; a second, book-mode pass (again=1)
        # reuses the scores the first pass cached on self.
        self.mod = "general"
        general_score = self.main_judge()
        mean_hw_nindict = self.mean_hw_nindict
        if self.syn_judge_score in [0, 1]:  # clustering already decided both modes
            book_score = self.syn_judge_score
        elif not self.score_on_err_wrriten:  # a detected typo fails both modes
            book_score = 0
        else:
            self.mod = "book"
            book_score = self.main_judge(again=1)
        return general_score, book_score, mean_hw_nindict
    else:  # "general" / "book"
        try:
            score = self.main_judge()
        # NOTE(review): main_judge's @func_set_timeout decorator is commented
        # out, so this timeout path appears unreachable at present — confirm.
        except func_timeout.exceptions.FunctionTimedOut:
            print("批改超时")
            score = self.final_score  # fall back to whatever was computed so far
        return score, self.mean_hw_nindict
if __name__ == '__main__':
    import os  # NOTE(review): unused here
    # Smoke test: judge one handwritten answer against a reference meaning.
    st = time.time()
    s, score1 = MainJudge("differ", "不同;区别于",
                          "vi.相异,不同,不一样 ",
                          mod="general")()  # (word, handwritten meanings, reference answer)
    # NOTE(review): for mod="general" __call__ returns (score, mean_hw_nindict),
    # so `s` holds the score and `score1` the not-in-dict list — the variable
    # names (and this print's order) look swapped; confirm.
    print("最后得分:", score1, s)
    print("时间花费:", time.time() - st)
|