# -*- coding: utf-8 -*-
# import re
# import time
import json
import my_config
import func_timeout
import numpy as np  # used directly by bert_similarity below
from sklearn.metrics.pairwise import cosine_similarity
from Final_word_Similarity.Hybrid_Sim import HybridSim
from sentence_transformers import util
# from item_embedding.chinese_emb import item2emb_cn
from item_embedding.all_lang_emb import item2emb_all
from Words.Dicts import errmean_words
from full_wash import ChWash
from Words.Phrase_dict import errmean_en_dict, phrases_dict_tk
from basic_logic import ItemInit, get_mean_in_dict
from Utils.wrong_written_judgement import err_mean_judge, KM_cidian_check, youdao_cidian_check
from Utils.han_semantic_similarity import han_similarity, pos_tag_han
from Utils.util import *  # re, time and other helpers are expected to come from this wildcard import
from Words.Dicts import single_mean_words, fixed_mean_words_dict
from Words.Words_classify import mean_fixed_words
from Words.ch2en_dict import ch_to_en_online
from Words.word_dict_from_textbook import word2mean_high, word2mean_junior
from Words.syn_antonyms import syn_km, syn_ft, syn_bing
from Words.phrases_syn_antonyms import phrase_syn_bing, phrase_syn_km
from Utils.translator import KM_ch2en, haici_zh2en, ch2en_baidu
from concurrent.futures import ThreadPoolExecutor
from func_timeout import func_set_timeout
# import gc

# logger = my_config.myLog(__name__, log_cate="e2cc_log").getlog()
ch2en_logger = my_config.simpLog(__name__, log_cate="ch2en_online_log").getlog()

"""
Known tricky case: the handwritten answer contains several meanings and one of them is an OCR error --
should that be marked wrong or right? ---62df443d87b408c08c51752c
Answer-format rule under test: wrongly written content must be fully blacked out.
"""


def bert_similarity(vc_model):
    """
    Cosine similarity between the two vectors produced by a BERT-style model.
    Intended for sentence vectors; word vectors do not work well here.
    vc_model: a 2xN array stacking the two word/sentence vectors.
    :return:
    """
    # ss = [[1, 0, 0, 0], [1, 0, 0, 0]]  # numpy.array(ss)
    two_vc = np.split(vc_model, 2, axis=0)
    res = cosine_similarity(two_vc[0], two_vc[1])
    return res
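
# A minimal usage sketch for bert_similarity (hypothetical 4-dim vectors; real inputs would be
# two sentence embeddings, e.g. stacked from item2emb_all output):
#   pair = np.vstack([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.25, 0.4]])
#   bert_similarity(pair)  # -> 1x1 cosine-similarity matrix; values near 1 mean "very similar"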


class MainJudge(ItemInit):
    # @staticmethod
    def hw_ch2en(self, mean_hw):
        """
        Translate a handwritten Chinese answer into English via online services.
        :param mean_hw:
        :return:
        """
        ch2en_online = []

        def km_ch2en(s):
            try:
                ch2en1 = KM_ch2en(s)
                ch2en_online.extend(ch2en1)
                # del ch2en1
                # gc.collect()
            except:
                pass

        def hc_ch2en(s):
            try:
                ch2en2 = haici_zh2en(s)
                ch2en_online.extend(ch2en2)
                # del ch2en2
                # gc.collect()
            except:
                pass

        def bd_ch2en(s):
            try:
                ch2en3 = ch2en_baidu(s)
                ch2en_online.extend(ch2en3)
                # del ch2en3
                # gc.collect()
            except:
                pass

        # ------- threading, variant 1 -----------
        # # IO-bound, so use multiple threads
        # threads = [Thread(target=km_ch2en, args=(mean_hw,)), Thread(target=hc_ch2en, args=(mean_hw,)),
        #            Thread(target=bd_ch2en, args=(mean_hw,))]
        # [t.start() for t in threads]
        # [t.join() for t in threads]
        # ------- threading, variant 2 -----------
        all_func = [hc_ch2en, bd_ch2en, km_ch2en]
        if self.need_ch2en_hw_num >= 3:
            all_func = [hc_ch2en]
        elif self.need_ch2en_hw_num >= 2:
            all_func = [hc_ch2en, bd_ch2en]
        if len(all_func) == 1:
            all_func[0](mean_hw)
        else:
            with ThreadPoolExecutor(max_workers=len(all_func)) as executor:
                for func in all_func:
                    executor.submit(func, mean_hw)
        ch2en_online_washed = []
        if ch2en_online:
            for c in set(ch2en_online):
                c = c.replace("’", "'")  # normalise curly apostrophes
                new_c = [c]
                if len(re.findall("[(（].+?[)）]", c)) == 1:
                    c_1 = re.split("[(（](.+?)[)）]", c)
                    if c_1[1] in ["s", "es", "ing"]:
                        new_c = [c_1[0], c_1[0] + c_1[1]]
                ch2en_online_washed.extend(new_c)
        return mean_hw, ch2en_online_washed
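
    # Illustrative return shape of hw_ch2en (hypothetical values; the actual list depends on
    # what the KM / haici / baidu services return):
    #   self.hw_ch2en("东南")  ->  ("东南", ["southeast", "southeasterly"])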

    def synonyms_cluster_judge(self, en_ptag):
        """
        Combined synonym/antonym cluster judgement:
        1. map the handwritten Chinese to English, then look up that English word's synonyms and antonyms;
        2. look up the Chinese meanings of those synonyms/antonyms and compare again.
        :return:
        """
        # 2+. Words whose meaning is fixed by the textbook
        if self.word in single_mean_words or self.word in fixed_mean_words_dict \
                or self.word in sum(list(mean_fixed_words.values()), []):
            self.is_word_with_fixed_mean = 1
            fixed_mean = []
            if self.word in word2mean_high:
                fixed_mean.append(list(word2mean_high[self.word].values()))
            if self.word in word2mean_junior:
                fixed_mean.append([word2mean_junior[self.word]])
            if self.word in fixed_mean_words_dict:
                fixed_mean.append([fixed_mean_words_dict[self.word].replace("\n", ";")])
            fixed_mean.append([self.fuller_means_simpwashed])
            # print(3333333333, ";".join(sum(fixed_mean, [])))
            fixed_mean_list = ChWash(self.word, ";".join(sum(fixed_mean, []))).zh_full_wash(source="hyk")
            fixed_mean_list = means_split(";".join(fixed_mean_list))  # split into individual meanings
            if all([True if mean_hw in fixed_mean_list else False for mean_hw in self.mean_hw_list]):
                return 1
            else:
                return 0
        else:  # synonym / antonym judgement
            synonyms, antonyms = [], []
            if self.word in syn_km:
                synonyms.extend(sum(list(syn_km[self.word]["synonyms"].values()), []))
                antonyms.extend(sum(list(syn_km[self.word]["antonyms"].values()), []))
            if self.word in syn_ft:
                synonyms.extend(syn_ft[self.word]["synonyms"])
                antonyms.extend(syn_ft[self.word]["antonyms"])
            if self.word in syn_bing:
                synonyms.extend(syn_bing[self.word]["synonyms"])
                antonyms.extend(syn_bing[self.word]["antonyms"])
            if self.word in phrase_syn_km:
                synonyms.extend(sum(list(phrase_syn_km[self.word]["synonyms"].values()), []))
                antonyms.extend(sum(list(phrase_syn_km[self.word]["antonyms"].values()), []))
            if self.word in phrase_syn_bing:
                synonyms.extend(phrase_syn_bing[self.word]["synonyms"])
                antonyms.extend(phrase_syn_bing[self.word]["antonyms"])
            synonyms = list([s.replace("’", "'") for s in set(synonyms)])
            antonyms = list([a.replace("’", "'") for a in set(antonyms)])
            # print("synonyms:", synonyms)
            # print("antonyms:", antonyms)
            # Synonyms can differ by sense: the answer's English may be in the synonym list
            # without sharing the relevant sense.
            is_all_synonyms = 1
            hw2en_equal_word = []  # handwritten meanings whose English equals the target word
            no_synonyms = 0  # no synonyms were found
            is_all_hw_en_exist = []  # whether every handwritten meaning has an English equivalent
            need_ch2en_online = []  # handwritten meanings that need an online English lookup
            # ch2en_dict_online = {}  # hw -> en dictionary fetched online
            # ---------- collect the handwritten answers that need an online lookup ----------
            time1 = time.time()
            for mean_hw in self.mean_hw_list:
                # len(self.word.split(" ")) == 1 and
                if "(" not in mean_hw and "（" not in mean_hw:  # skip answers that contain brackets
                    need_ch2en_online.append(mean_hw)
            # ---------- fetch English equivalents online ----------
            need_ch2en_online = list(set(need_ch2en_online))
            # reuse the cached ch_to_en_online dictionary first; only the rest is fetched online
            ch2en_dict_online = {ch: ch_to_en_online[ch] for ch in need_ch2en_online if ch in ch_to_en_online}
            need_ch2en_online = [ch for ch in need_ch2en_online if ch not in ch_to_en_online]
            if need_ch2en_online:
                max_workers = 1
                if len(need_ch2en_online) >= 6:
                    max_workers = 6
                elif len(need_ch2en_online) > 1:
                    max_workers = len(need_ch2en_online)
                self.need_ch2en_hw_num = len(need_ch2en_online)  # number of answers that need an online lookup
                if max_workers > 1:
                    with ThreadPoolExecutor(max_workers=max_workers) as executor:
                        for future in executor.map(self.hw_ch2en, need_ch2en_online):
                            ch2en_dict_online[future[0]] = future[1]
                            if future[1]:  # log the online ch2en result; the dictionary is refreshed from it once a day
                                ch2en_logger.info(json.dumps({"{}".format(future[0]): future[1]}, ensure_ascii=False))
                else:  # only one answer needs an online lookup
                    hw, hw2en = self.hw_ch2en(need_ch2en_online[0])
                    if hw2en:
                        ch2en_dict_online[hw] = hw2en
                        ch2en_logger.info(json.dumps({"{}".format(hw): hw2en}, ensure_ascii=False))
            # print("time spent fetching en online:", time.time() - time1)
            # ----------------- judge each answer's English against the synonym/antonym lists -----------------
            for mean_hw in self.mean_hw_list:
                dict_ch2en = my_config.dict_ch2en
                hw_en = []
                if mean_hw in dict_ch2en:  # from the curated ch2en dictionary
                    hw_en = list(dict_ch2en[mean_hw])  # copy so extend() below does not mutate the shared dict
                    # print("curated hw_en:", mean_hw, hw_en)
                if mean_hw in ch2en_dict_online and ch2en_dict_online[mean_hw]:
                    # print("ch2en_online:", ch2en_dict_online[mean_hw])
                    hw_en.extend(ch2en_dict_online[mean_hw])
                # print("all--hw_en:", hw_en)  # e.g. ['southeast', 'southeasterly']
                if hw_en:
                    hw_en = list(set(map(lambda x: x.lower(), hw_en)))  # lowercase
                    # print("lowercased hw_en:", hw_en)
                    if any([True for en in hw_en if en in antonyms and en not in synonyms]):  # antonym
                        return 0
                    if any([True for en in hw_en if en == self.word]):  # hw_en equals the prompt word
                        if any([re.search(mean_hw + "[的地]$", u) for u in self.union_means]):
                            # wrong, but could also be right (if there is a non-adjective synonym sense)
                            return -1
                        if not self.mean_hw_cutted:
                            continue
                        else:  # several handwritten meanings that were truncated: one match is enough
                            return 1
                    # synonyms.append(self.word)
                    if en_ptag == "n-phrase" and mean_hw not in self.mean_hw_cutted and \
                            any([True for en in hw_en if en != self.word and
                                 re.search("^" + en + "[a-z']{,4} [a-z]{3,}$", self.word)]):
                        return 0
                    # print("hw_en:::", hw_en)
                    if any([True for en in hw_en if en != self.word and re.search("^" + en + "[a-z']{,4}$", self.word)]):
                        # steady progress vs steady progression; utter vs utterly
                        equal_en = [en for en in hw_en if en != self.word and re.search("^" + en + "[a-z']{,4}$", self.word)
                                    and en + 'ly' == self.word and en_ptag in ["RB", "ADV"]]
                        if equal_en:
                            return 0
                        continue
                    if any([True for en in hw_en if en == self.word.replace(" ", "") and
                            self.word.replace(" ", "") not in ["breakdown"]]):  # en equals the prompt word without spaces
                        hw2en_equal_word.append(mean_hw)
                    elif not self.mean_hw_cutted and any([True for en in hw_en if en != self.word
                                                          and re.search(r"\s" + en + "|" + en + r"\s", self.word)
                                                          and en_ptag == "n-phrase"]):
                        # en in self.word: a noun phrase where only half of it was translated
                        return 0
                    elif not synonyms:
                        no_synonyms = 1
                        break
                    elif all([True if en not in synonyms else False for en in hw_en]):  # none of the en is a synonym
                        # TODO: should adjectives/adverbs fall through to similarity matching when the synonym check fails????
                        if not self.is_phrase:  # single word
                            if re.search("[\u4e00-\u9fa5]+的[\u4e00-\u9fa5]", mean_hw):  # e.g. virtue(高尚的品德)
                                return -1
                            if len(mean_hw) <= 2:
                                is_all_synonyms = 0  # at least one answer is not a synonym
                            else:
                                if not self.sbert_row_maxsimi_res:
                                    embs = item2emb_all(self.mean_hw_list + list(self.union_means))
                                    self.sbert_simi_res = util.cos_sim(embs[0:len(self.mean_hw_list)],
                                                                       embs[len(self.mean_hw_list):])
                                    self.sbert_row_maxsimi_res = [max(s) for s in self.sbert_simi_res.tolist()]
                                # print("row max similarities inside synonym clustering:", self.sbert_row_maxsimi_res)
                                if max(self.sbert_row_maxsimi_res) >= 0.9:
                                    return -1
                                else:
                                    is_all_synonyms = 0  # at least one answer is not a synonym
                        else:  # phrases fall through to the later similarity checks
                            return -1
                    else:
                        # The answer's English is in the synonym list, but possibly on a different sense.
                        syns = [en for en in hw_en if en in synonyms]  # the en found in synonyms
                        # print("en found in synonyms:", syns)
                        for syn in syns:
                            _, syn_mean = get_mean_in_dict(syn.strip(), mod="all")
                            # syn_mean = ""  # Chinese definition of en
                            # if syn in words_dict:
                            #     syn_mean += words_dict[syn]
                            # if syn in more_words_dict:
                            #     syn_mean += ";" + more_words_dict[syn]
                            # if syn in phrases_dict_tk:
                            #     syn_mean += ";" + phrases_dict_tk[syn]
                            # print(11111, syn_mean)
                            # mean_hw appearing in syn_mean may or may not be a sense of this en
                            if re.search("(?<![\u4e00-\u9fa5])" + mean_hw + "(?![\u4e00-\u9fa5])", syn_mean):
                                return -1
                            # filter by part of speech (some listed synonyms are wrong)
                            pos_mean = re.findall(r"(?<![a-z])([a-z]+)\.", syn_mean)
                            pos_fuller_mean = re.findall(r"(?<![a-z])([a-z]+)\.", self.fuller_means_nowashed)
                            if re.search(r"n\.\s*" + mean_hw, syn_mean) and pos_fuller_mean and \
                                    all([True if a[0] in ["v", "V"] else False for a in pos_fuller_mean]):
                                return 0
                            if (re.search("的$", mean_hw) is None and pos_tag_han(mean_hw, flag="by_list") != 'a') and \
                                    (phrase_classify(syn) == "JJ" or (pos_mean and
                                     all([True if a[0] == "adj" else False for a in pos_mean]))):
                                return 0
                        if not self.sbert_row_maxsimi_res:
                            embs = item2emb_all(self.mean_hw_list + list(self.union_means))
                            self.sbert_simi_res = util.cos_sim(embs[0:len(self.mean_hw_list)], embs[len(self.mean_hw_list):])
                            self.sbert_row_maxsimi_res = [max(s) for s in self.sbert_simi_res.tolist()]
                        # print("row max similarities inside synonym clustering:", self.sbert_row_maxsimi_res)
                        if max(self.sbert_row_maxsimi_res) < 0.6:
                            return 0
                        else:
                            return -1
                        # # Also need to handle incomplete synonym clusters
                        # en_means_text = ""
                        # for en in hw_en:
                        #     en_means_text += get_mean_in_dict(en, mod="all")
                        # all_means = means_split(en_means_text)  # split
                        # all_means, _ = wash(all_means, pos="standard")
                        # # if any([True for j in all_means if j in self.ans_list or j in self.fuller_means_simpwashed]):
                        # #     print(all_means)
                        # #     print(self.fuller_means_simpwashed, other_mean)
                        # #     print([j for j in all_means if j in self.ans_list or j in self.fuller_means_simpwashed])
                        # if any([True for j in all_means if j in self.ans_list or j in other_mean]):
                        #     hw2en_equal_word.append(mean_hw)
                        # else:
                        #     is_all_synonyms = 0
                        #     break
                else:
                    is_all_hw_en_exist.append(-1)
                    # return -1
            if hw2en_equal_word:
                self.mean_hw_list = [j for j in self.mean_hw_list if j not in hw2en_equal_word]
            if not is_all_synonyms:
                return 0
            elif no_synonyms:
                return -1
            elif -1 in is_all_hw_en_exist:
                return -1
            else:
                return 1
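
    # Return contract of synonyms_cluster_judge (as used by main_judge below):
    #   1  -> every handwritten meaning was accepted
    #   0  -> judged wrong
    #  -1  -> undecided; main_judge falls through to the typo check and the similarity models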

    def cilin_similarity_judge(self):
        """
        Similarity judgement based on the Cilin (synonym thesaurus) score.
        :return:
        """
        scores = []
        t3 = time.time()
        hb = HybridSim()
        for idi, mean_hw in enumerate(self.mean_hw_list):
            if re.search("[()（）,，、]", mean_hw):
                continue
            one_scores = []
            for idj, ans in enumerate(self.ans_list):
                # print(mean_hw, " vs ", ans)
                if re.search("[()（）,，、]", ans):
                    continue
                cilin_score = hb.get_Final_sim(mean_hw, ans)
                # print("cilin_score:", cilin_score)
                if cilin_score >= 0.9 or cilin_score == -2:  # does -1 need handling as well????
                    # double-check with the han similarity model
                    if not self.simires_han_rawshape:
                        self.han_simi_res, self.simires_han_rawshape = han_similarity(self.word, self.mean_hw_list,
                                                                                      self.ans_list, self.cutted_words)
                    # mean_hw = re.sub(r"^(.{2,})[地了的得等]$", r"\1", mean_hw)
                    # ans = re.sub(r"^(.{2,})[地了的得等]$", r"\1", ans)
                    # han_score = han_similarity(self.word, [mean_hw], [ans], self.cutted_words)[0][0]
                    han_score = self.han_simi_res[idi][idj]
                    if han_score > 0.4:
                        if cilin_score == -2:  # the word was not found in the Cilin thesaurus
                            one_scores.append(han_score)
                        else:
                            one_scores.append(cilin_score)
                    elif han_score == 0.0:
                        # return 0
                        one_scores.append(0.0)
                    else:
                        one_scores.append(han_score)
                else:
                    one_scores.append(cilin_score)
            if one_scores:
                if max(one_scores) == 0.0 and min(one_scores) != -1:
                    return 0
                scores.append(max(one_scores))
        # print("\n4. Cilin similarity scores:", scores, "time:", time.time() - t3)
        if all([True if s >= 0.9 else False for s in scores]):  # every similarity is high
            return 1
        else:
            # drop the meanings scoring 0.9 or above so the later logic does not compare them again
            idx_09 = [i for i, s in enumerate(scores) if s >= 0.9]
            if idx_09:
                self.mean_hw_list = [mean for i, mean in enumerate(self.mean_hw_list) if i not in idx_09]
            return scores
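
    # Note: cilin_similarity_judge either decides outright (returns the int 0 or 1) or returns the
    # per-meaning score list so the caller can keep judging; main_judge currently keeps this call
    # commented out.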

    def score_res(self, general_score):
        if self.mod in ["book", "all"]:
            if self.score_onbook == -1:
                self.score_onbook = general_score
            if self.mod == "all":
                return general_score, self.score_onbook
            elif self.mod == "book":
                return self.score_onbook
        else:
            return general_score

    # @func_set_timeout(1.2)
    def judge_err_wrriten(self):
        """
        Score based on wrongly written characters (typos) in the handwritten answer.
        :return:
        """
        # time2 = time.time()
        noerr_res, char_similar_score, zhword_judged = err_mean_judge(self.mean_hw_list, self.ans_list)
        if noerr_res < 1:
            # print("time spent on typo detection::", time.time() - time2)
            if not KM_cidian_check(zhword_judged):
                # dictionary database lookup
                if not youdao_cidian_check(zhword_judged):
                    self.score_on_err_wrriten = 0
                else:
                    noerr_res = 1  # no typo after all
                # print("time spent on typo detection 2::", time.time() - time2)
            else:
                noerr_res = 1  # no typo after all
        return noerr_res
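
    # judge_err_wrriten returns 1 when no wrongly written character is found (or the suspect word
    # is confirmed by the KM / Youdao dictionary checks); otherwise it leaves noerr_res below 1 and
    # sets self.score_on_err_wrriten to 0 as a side effect, which fails the item in main_judge.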

    def emb_similar(self, en, hw_list, ans_given, is_token=0, noerr=1, flag=0):
        """
        Embedding-based semantic similarity.
        is_token: whether the handwritten answers have already been tokenised
        flag: 1 means pass self.cutted_words to the han similarity model
        :return:
        """
        score = 0
        # sentence-bert model: better suited to long text
        # embs = item2emb_cn(hw_list + ans_given)
        # simi_res = util.cos_sim(embs[0:len(hw_list)], embs[len(hw_list):])
        # row_maxsimi_res = [max(s) for s in simi_res.tolist()]
        # hanlp semantic similarity model: better suited to short text
        st7 = time.time()
        if flag == 1:
            self.han_simi_res, _ = han_similarity(en, hw_list, ans_given, self.cutted_words, is_token)
        else:
            self.han_simi_res, _ = han_similarity(en, hw_list, ans_given, [], is_token)
        # print("simi_res:", hw_list, ans_given, self.han_simi_res)
        print("hanlp model time:", time.time() - st7)
        row_maxsimi_res = [max(s) for s in self.han_simi_res if s]
        if is_token:
            if all([True if i * noerr > 0.9 else False for i in row_maxsimi_res]):
                score = 1
        elif min(row_maxsimi_res) * noerr == 0 and hw_list[row_maxsimi_res.index(min(row_maxsimi_res))] \
                not in self.mean_hw_cutted:  # only when not is_token
            score = 0
        elif max(row_maxsimi_res) * noerr >= 0.895:
            st8 = time.time()
            embs = item2emb_all(hw_list + ans_given)  # unlike the call above, this has to be computed separately here
            self.sbert_simi_res = util.cos_sim(embs[0:len(hw_list)], embs[len(hw_list):]).tolist()
            self.sbert_row_maxsimi_res = [max(s) for s in self.sbert_simi_res]
            # print("sentence-bert model:", self.sbert_simi_res)
            print("sentence-bert model time:", time.time() - st8)
            if max(self.sbert_row_maxsimi_res) < 0.45:
                score = max(self.sbert_row_maxsimi_res)
            else:
                score = 1
                for n, one_res in enumerate(self.han_simi_res):  # the two models' indices are not aligned; still needs work!!!!
                    if max(one_res) > 0.9:
                        maxres_idx = one_res.index(max(one_res))
                        if maxres_idx < len(self.sbert_simi_res[n]) and self.sbert_simi_res[n][maxres_idx] < 0.45:
                            score = 0
                            break
        else:
            score = max(row_maxsimi_res) * noerr
        # print("embedding similarity scores: {}------{} vs {}, time: {}".format(row_maxsimi_res, str(hw_list),
        #                                                                        str(ans_given), time.time() - t4))
        return score
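
    # emb_similar either returns a hard decision (0 or 1) or the best raw similarity scaled by
    # `noerr`; main_judge treats anything above 0.8 on its final pass as correct.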

    # @func_set_timeout(6)
    def main_judge(self, again=0):
        """
        Combined semantic-equivalence judgement:
        1) look up the English word/phrase in the item bank, build its list of meanings and compare;
        2) Cilin similarity;
        3) embedding semantic similarity;
        4) Chinese/English synonym-antonym checks to cover the weaknesses of 2) and 3).
        In textbook-meaning mode, a failed answer comparison still goes through the similarity checks;
        words not covered by the textbook fall back to general marking.
        en_word: an English word or phrase
        zh_mean_hw: the handwritten Chinese translation (is a single correct meaning enough to pass?)
        :return:
        """
        # ------ basic judgement: compare against the given answers -------
        st1 = time.time()
        score = self.simp_judge()
        print("simple judgement time:", time.time() - st1)
        if score == "0" or score == 1:
            return int(score)
        else:
            if not self.mean_hw_list:
                return 0
        # ------- words with known wrong meanings are special-cased -----------
        washed_word = re.sub(r"…+\s*$", "", self.word).strip()
        if washed_word in errmean_words and any([True for hw in self.mean_hw_list
                                                 if re.sub(r"…+\s*$", "", hw).strip() in errmean_words[washed_word]]):
            return 0
        if washed_word in errmean_en_dict or self.err_mean_list:
            temp_mean_hw_list = [re.sub(r"(?<![….])(…+|\.{3,})", "…", m) for m in self.mean_hw_list]
            errmean_list = self.err_mean_list
            if washed_word in errmean_en_dict:
                errmean_list.extend(re.split("[;；,，]", errmean_en_dict[washed_word]))
            if any([True for hw in temp_mean_hw_list if re.sub(r"…+\s*$", "", hw).strip() in errmean_list]):
                return 0
        # -------------- special cases --------------------------
        # e.g. 生效 (take effect) differs from 使生效 (bring into effect)
        try:
            if any([True for m in self.mean_hw_list if re.search(r"[使令对让].{,2}" + m, ";".join(self.union_means))]):
                return 0
            if any([True for u in self.union_means if re.search(r"[使令对让].{,2}" + u, ";".join(self.mean_hw_list))
                    and re.search(r"(?<=[);])[使令对让].+",
                                  ";" + ";".join(self.union_means)) is None]):
                return 0
        except:
            pass
        # 的 / 得 / 地 confusion
        if any([True for hw in self.mean_hw_list if re.search("[\u4e00-\u9fa5]{2,}的[\u4e00-\u9fa5]+", hw) and
                re.sub(r"([\u4e00-\u9fa5]{2,})的([\u4e00-\u9fa5]+)", r"\1得\2", hw) in ";".join(self.union_means)
                and all([True if re.search("的(?!$)", a) is None else False for a in self.union_means])]):
            return 0
        # xxx for/of
        w1 = re.search("^(.+?) (for|of)$", self.word.strip())
        if w1 and phrase_classify(w1.group(1)) in ["n-phrase", "NN"]:
            _, part_mean = get_mean_in_dict(w1.group(1).strip(), mod="all")
            part_mean_list = means_split(part_mean)
            if any([True for hw in self.mean_hw_list if hw in part_mean_list]):
                return 0
        # for phrases, analyse the part of speech first (verb phrase vs adj/noun phrase)
        ptag = phrase_classify(self.word)
        # print("ptag:::", ptag)
        if ptag == "v-phrase" and any([True for h in self.mean_hw_list if re.search(r"(.+)的$", h)]):
            return 0
        if ptag == 'prep-phrase' and self.word.split(" ")[0] in ["in", "with", "on", "at"] \
                and any([True for h in self.mean_hw_list if re.search("的[\u4e00-\u9fa5]+[^中上下]$", h)
                         and re.search("^在.+?的[\u4e00-\u9fa5]+$", h) is None]):  # with reason vs …的理由
            return 0
        # "忽略的" (ignored) is not the same as "可忽略的" (negligible)
        if re.search("ble$", self.word) and any([True for hw in self.mean_hw_list
                                                 if re.search("可以?" + hw + "的?", self.fuller_means_simpwashed)]):
            return 0
        # an adjective answered as a noun
        if any([True for hw in self.mean_hw_list if hw + "的" in self.union_means and ptag == "JJ"]):
            return 0
        union_mean_txt = self.ans_given + ";" + self.fuller_means_simpwashed
        if any([True for hw in self.mean_hw_list if (hw[-1] == "地" and hw[:-1] + "的" in union_mean_txt and
                                                     re.search("的(?![\u4e00-\u9fa5])", union_mean_txt) is None)
                or (hw[-1] == "的" and hw[:-1] + "地" in union_mean_txt and
                    re.search("地(?![\u4e00-\u9fa5])", union_mean_txt) is None)]):
            return 0
        # ----------------- synonym-cluster judgement --------------------
        # TODO: verify how much dropping this check changes the final results;
        # in textbook-meaning mode a synonym sense is not necessarily a textbook sense.
        if not again or self.syn_judge_score == -2:  # -2 means the value was never set (for the second pass when mod=all)
            st1 = time.time()
            self.syn_judge_score = self.synonyms_cluster_judge(ptag)
            print("synonym-cluster judgement time:", time.time() - st1)
        score = self.syn_judge_score  # when again=1, reuse the previous value
        print("synonym judgement score:", score)
        if score != -1:
            return score
        else:
            if self.is_word_with_fixed_mean:  # a word whose meaning is fixed
                return 0
        # ----------- typo check: a single wrongly written answer fails the item ---------------
        st1 = time.time()
        if not again or self.noerr_res == -1:  # -1 means the value was never set (for the second pass when mod=all)
            self.noerr_res = self.judge_err_wrriten()
            print("typo-check time:", time.time() - st1)
            print(self.score_on_err_wrriten)
        if self.score_on_err_wrriten == 0:  # when again=1, reuse the previous value
            return 0
        noerr_res = self.noerr_res  # when again=1, reuse the previous value
        # ------------------ Cilin similarity ------------------------------
        # 4. Meanings that already matched during answer comparison are not compared again;
        #    mean_hw_list is updated in simp_judge.
        # scores = self.cilin_similarity_judge()
        # if type(scores) == int:
        #     return scores
        # ---------------------- model similarity ------------------------------
        # if not score or max(score) < 0.9 or min(scores) < 0:
        if score < 0.9:
            # Strip the shared tail substring first (e.g. 欣赏外国文学 vs 赞美外国文学) and judge it later
            # if self.cutted_words and len(self.word.split(" ")) > 1:
            #     for j in self.cutted_words[::-1]:
            #         if j + ";" in ";".join(self.ans_list) + ";":
            #             self.ans_list = [re.sub(j + "$", "", ans) for ans in self.ans_list]
            #             self.mean_hw_list = [re.sub(j + "$", "", mean) for mean in self.mean_hw_list]
            #             print(111111, self.mean_hw_list)
            score = self.emb_similar(self.word, self.mean_hw_list, self.ans_list, noerr=noerr_res, flag=1)  # first call
        # if not score or score == 1:
        if score == 1:  # a score of 0 here still has to consider more_means
            self.final_score = score
        else:
            score_first = score
            if self.hw_with_brace:
                self.mean_hw_list.extend(sum(self.hw_with_brace.values(), []))
            if self.more_mean:  # call the similarity model once more
                # print("more_means:", self.more_mean)
                score = self.emb_similar(self.word, self.mean_hw_list, self.more_mean, noerr=noerr_res)  # second call
                if not score or score == 1:
                    return score
                if score_first < 0.1 and score < 0.1:
                    return 0
            if time.time() - self.stime < 3:
                if self.cutted_words:
                    final_score = self.emb_similar(self.word, self.cutted_words, self.ans_list, 1, noerr=noerr_res)  # third call
                    if final_score and final_score != 1:
                        more_means = list(set(self.more_mean) - set(self.ans_list))
                        final_score = self.emb_similar(self.word, self.cutted_words, more_means, 1, noerr=noerr_res)  # fourth call
                    if final_score < 1:
                        self.final_score = 0
                else:
                    self.final_score = 0
            else:
                self.final_score = 1 if score > 0.8 else 0
        return self.final_score
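
    # When mod="all", __call__ below runs main_judge twice; the second (textbook) pass uses again=1
    # so the cached self.syn_judge_score and self.noerr_res are reused instead of recomputed.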

    def __call__(self):
        """
        Return the score(s) for the requested marking mode: textbook meaning ("book"),
        general meaning ("general"), or both ("all").
        When no textbook meaning exists, general marking is used.
        :return:
        """
        self.simp_wash()
        if not self.mean_hw_list:
            if self.mod == "all":
                return 0, 0, []
            return 0, []
        self.getmean_inbook()
        if self.mod == "all":
            self.mod = "general"
            general_score = self.main_judge()
            mean_hw_nindict = self.mean_hw_nindict
            if self.syn_judge_score in [0, 1]:
                book_score = self.syn_judge_score
            elif not self.score_on_err_wrriten:
                book_score = 0
            else:
                self.mod = "book"
                book_score = self.main_judge(again=1)
            return general_score, book_score, mean_hw_nindict
        else:  # "general" / "book"
            try:
                score = self.main_judge()
            except func_timeout.exceptions.FunctionTimedOut:
                print("marking timed out")
                score = self.final_score
            return score, self.mean_hw_nindict


if __name__ == '__main__':
    import os
    # s = cosine_similarity([[1, 0, 0, 0]], [[1, 0, 0, 0]])
    st = time.time()
    s, score1 = MainJudge("differ", "不同;区别于",
                          "vi.相异,不同,不一样 ",
                          mod="general")()  # hw, ans
    print("final score:", score1, s)  # , score_onbook general book
    print("time spent:", time.time() - st)
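
    # A hedged usage sketch for mod="all" (illustrative only; per __call__ above, this mode returns
    # the general score, the textbook score and the meanings not found in the dictionary):
    # general_score, book_score, not_in_dict = MainJudge("differ", "不同;区别于",
    #                                                    "vi.相异,不同,不一样 ", mod="all")()
    # print(general_score, book_score, not_in_dict)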