wrong_written_judgement.py

# -*- coding: utf-8 -*-
import re
import jieba  # jieba Chinese word segmentation
import requests
from Utils.util import cos_sim
from Utils.han_semantic_similarity import pos_tag_han
import sqlite3
# logger = my_config.myLog(__name__, log_cate="misswritten_log").getlog()
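# NOTE (assumption): Utils.util.cos_sim, Utils.han_semantic_similarity.pos_tag_han and
# Autochecker4Chinese.auto_check are project-local / third-party helpers that are not
# defined in this file; they are assumed to be importable from the project root.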
def err_mean_judge(mean_hw_list, ans_list):
    """
    Judge wrong or opposite meanings.
    A dictionary lookup is added to assist the judgement.
    :param:
    :return:
    """
    from Autochecker4Chinese import auto_check
    char_similar_score = 1
    for ans_hw in mean_hw_list:
        # 1. First check for wrongly written characters (typos).
        # corrected_sent, detail = pycorrector.correct('ans_hw')  # the model to download is too large
        # print(auto_check.auto_correct(ans_hw))
        if "…" in ans_hw:
            ans_hw = re.sub(r"^.*?…+的?(.+)", r"\1", ans_hw)
        # Strip common prefixes/suffixes and filler particles so only the core word is checked.
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2[地了的得等]", r"\1", ans_hw)  # e.g. 信心满满的
        new_ans_hw = re.sub(r"^(.{2,})[地了的得等]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(.+)出[来去]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(向前|得到|不是|继续|[使得有我令对让向将好]+)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^的+|^(想要|正在)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^未([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        new_ans_hw = re.sub("某人[\u4e00-\u9fa5]+某事", "", new_ans_hw)
        new_ans_hw = re.sub("某[人物事]", "", new_ans_hw)
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2", r"\1\2", new_ans_hw)  # e.g. 速率率
        if re.search("为$", new_ans_hw) and pos_tag_han(re.sub("为+$", "", new_ans_hw)) == 'v':
            new_ans_hw = re.sub("为+$", "", new_ans_hw)
        elif re.search("^不", new_ans_hw) and pos_tag_han(re.sub("^不", "", new_ans_hw)) == 'a':
            new_ans_hw = re.sub("^不", "", new_ans_hw)
        new_ans_hw = re.sub(r"^不([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        if len(new_ans_hw) > 1:
            # typo check
            may_candidate = auto_check.auto_correct(new_ans_hw, flag="all")
            print("new_ans_hw:{}---may_candidate:::{}".format(new_ans_hw, may_candidate))
            if may_candidate and new_ans_hw not in may_candidate:
                # logger.info("contains a typo: {}-{}\nmay_candidate:{}".format(ans_hw, new_ans_hw, may_candidate))
                if len(may_candidate) <= 5:
                    if len(new_ans_hw) == 2 and re.search("的$", new_ans_hw):
                        continue
                    if len(new_ans_hw) > 2:
                        new_ans_hw2 = re.sub(r"^(绝|开始|才|最|刚+|在)", "", new_ans_hw)
                        new_ans_hw2 = re.sub(r"会$", "", new_ans_hw2)
                        if new_ans_hw2 != new_ans_hw and len(new_ans_hw2) > 1:
                            may_candidate = auto_check.auto_correct(new_ans_hw2, flag="all")
                            if may_candidate and new_ans_hw2 not in may_candidate:
                                pass
                            else:
                                continue
                        elif len(new_ans_hw) >= 4:
                            # Fall back to word segmentation and check each token separately.
                            jieba_tok = jieba.lcut(new_ans_hw)
                            may_right = []
                            for tok in jieba_tok:
                                may_candidate = auto_check.auto_correct(tok, flag="all")
                                if may_candidate and tok not in may_candidate:
                                    may_right.append(0)
                                else:
                                    may_right.append(1)
                                    break
                            if sum(may_right) / len(may_right) == 1:
                                continue
                            else:
                                return 0.91, char_similar_score, new_ans_hw
                return 0, char_similar_score, new_ans_hw
        # character-level similarity
        if ans_hw not in ans_list and all(len(ans_hw) <= 4 and cos_sim(ans, ans_hw) < 0.2
                                          for ans in ans_list):
            char_similar_score = 0
        # if may_candidate != new_ans_hw and may_candidate not in ans_list and \
        #         cos_sim(may_candidate, new_ans_hw) < 0.8:  # temporary, e.g. “每一个” vs “每个”
        #     return 0
    return 1, char_similar_score, ""
    # Incomplete reference answers are not handled for now.
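
# --- Illustrative usage sketch (not part of the original module) ----------------
# The word lists below are hypothetical; err_mean_judge() expects the meanings a
# student wrote (mean_hw_list) and the reference answers (ans_list), and returns
# (score, char_similar_score, suspect_word), where a score of 0 or 0.91 flags a
# likely wrongly written word and 1 means nothing suspicious was found.
#
#     score, char_sim, suspect = err_mean_judge(["信心满满的"], ["满怀信心"])
#     if score != 1:
#         print("possible wrongly written word:", suspect)
# ---------------------------------------------------------------------------------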

def KM_cidian_check(text):
    """
    KM dictionary (kmcha.com): check whether a Chinese word exists.
    :param text:
    :return: 1: exists 0: does not exist
    """
    word = str(text).strip()  # .replace(" ", "%20").replace("!", " ")  # .replace(".", "")
    try:
        r = requests.get("https://kmcha.com/cidian/{}".format(word), timeout=0.5)  # time-consuming
    except Exception:
        return 0
    text = r.content.decode('utf8')  # note: rebinds the parameter to the page HTML
    # print(text)
    if re.search("<strong>.{2,4}释义</strong>", text):
        return 1
    elif re.search(r"<p>[\s\n]*<a href=.*?</a>[\s\n]*</p>[\s\n]*</div>", text, flags=re.S) is None:
        return 1
    else:
        related_ci = []
        a_info = re.findall(r"<div>[\s\n]*<p>[\s\n]*<strong>相关查询</strong>[\s\n]*</p>[\s\n]*<p>(.+?)</p>[\s\n]*</div>",
                            text, flags=re.S)
        if a_info:
            related_ci.extend(re.findall("<a href=[^>]*?>(.+?)</a>", a_info[0]))
            print("related_ci:", related_ci)
        # if word not in related_ci:
        #     return 0
        # else:
        new_related_ci = related_ci[:-2]
        print(new_related_ci)
        # if len(text) <= 2:
        related_ci_text = ";".join(new_related_ci)
        if word in new_related_ci or word in related_ci_text:
            return 1
        elif len(text) <= 3:
            return 0
        else:
            exist_w = ""
            idx = 0
            for i in range(len(word)):
                if word[i] in related_ci_text:
                    exist_w += word[i]
                    idx += 1
                else:
                    break
            if len(exist_w) >= 2 and len(word[idx:]) >= 2:
                from Autochecker4Chinese import auto_check
                may_candidate = auto_check.auto_correct(word[idx:], flag="all")
                if may_candidate and word[idx:] in may_candidate:
                    return 1
                else:
                    return 0
            else:
                return 0
    # return 0
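
# NOTE (assumption): KM_cidian_check relies on scraping kmcha.com HTML, so its markers
# ("…释义", "相关查询") will silently stop matching if the page layout changes, and the
# 0.5 s timeout makes any slow response count as "not found". A 0 from this function is
# therefore better read as "unconfirmed" than as proof the word does not exist.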

def youdao_cidian_check(text):
    """
    Youdao dictionary check. 1: exists 0: does not exist
    :return:
    """
    try:
        r = requests.get("http://dict.youdao.com/w/{}/#keyfrom=dict2.top".format(text), timeout=0.5)
    except Exception:
        return 0
    all_content = r.content.decode('utf-8')
    # print(all_content)
    if "您要找的是不是:" in all_content:
        return 0
    return 1
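
# --- Illustrative sketch (assumption, not in the original code) ------------------
# Because both dictionary checks depend on short-timeout web requests, a caller might
# want to treat a word as existing if either source confirms it:
#
#     def word_exists(word):
#         return KM_cidian_check(word) == 1 or youdao_cidian_check(word) == 1
# ---------------------------------------------------------------------------------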

def check_by_bigdata(txt):
    """
    Search a Wikipedia-derived corpus to judge whether a collocation is common ==> judge right or wrong.
    :return:
    """
    db_file = r'F:\zwj\WiKi\wiki_zh.db'
    conn_seg1 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_1.db")
    conn_seg2 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_2.db")
    conn_seg3 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_3.db")
    cur1 = conn_seg1.cursor()
    cur2 = conn_seg2.cursor()
    cur3 = conn_seg3.cursor()

    def search(ss, mod=0):  # mod=0: the query result itself is not needed
        res = []
        cur1.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
        res1 = cur1.fetchone()
        if mod == 1:
            if res1:
                res.extend(res1[1].split(","))
            cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res2 = cur2.fetchone()
            if res2:
                res.extend(res2[1].split(","))
            cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res3 = cur3.fetchone()
            if res3:
                res.extend(res3[1].split(","))
            res = set(res)
            return res
        else:
            if not res1:  # reuse the row fetched above; calling fetchone() again would advance the cursor
                cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                if not cur2.fetchone():
                    cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                    if not cur3.fetchone():
                        return 0
            return 1

    def text_seg(txt):
        segs = jieba.lcut(txt.strip(), cut_all=False)  # accurate mode
        f_stop = open(r'G:\zwj\WL\en2cn\files\main\stopwords.txt', encoding='utf-8')  # custom Chinese stopword list
        stopwords = [line.strip() for line in f_stop]  # strip() removes the given leading/trailing characters (whitespace by default)
        f_stop.close()
        segs = [re.sub(r"^[a-zA-Z\d_]+|[a-zA-Z\d_]+$", "", wl) for wl in segs]
        new_segs = [s.strip() for s in segs if
                    len(s) > 1 and re.search("[\u4e00-\u9fa5]", s) and s.strip() not in stopwords]
        # if len(segs) >= 2:  # combining adjacent segments does not work well
        #     for n in range(len(segs) - 1):
        #         new_segs.append(segs[n] + segs[n + 1])
        return new_segs

    term_data = search(txt.strip())
    if not term_data:
        comid = set()  # initialized so the elif below is safe when segmentation yields nothing
        segs = text_seg(txt)
        if segs:
            # print(segs)
            comid = search(segs[0], mod=1)
            if len(segs) > 1:
                for seg in segs[1:]:
                    t_id = search(seg, mod=1)
                    comid = comid.intersection(t_id)
            if not comid:
                return 0
            elif len(comid) > 5:
                # print(comid)
                return -1  # uncertain
            else:
                conn = sqlite3.connect(database=db_file)
                cur = conn.cursor()
                sourcetxt = ""
                for seg_id in comid:
                    cur.execute("SELECT * FROM wiki_zh_table WHERE seg_id=?", (seg_id,))
                    restxt = cur.fetchone()
                    if restxt:
                        sourcetxt += restxt[1]
                cur.close()
                if txt not in sourcetxt:
                    return 0
        elif not comid:
            return 0
    cur1.close()
    cur2.close()
    cur3.close()
    return 1
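
# --- Illustrative usage sketch (assumption, not in the original code) -------------
# check_by_bigdata() returns 1 (collocation found / plausible), 0 (not found) or -1
# (too many candidate articles, undecided). The SQLite paths above are machine-
# specific, so the call below only works where those databases exist:
#
#     verdict = check_by_bigdata("掌上明珠")
#     print({1: "common collocation", 0: "not found", -1: "uncertain"}[verdict])
# ---------------------------------------------------------------------------------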

if __name__ == '__main__':
    import time
    mean_hw_list = []
    ans_list = []
    # err_mean_judge(mean_hw_list, ans_list)
    # a = KM_cidian_check("想要做某事")
    # print(a)
    # a = youdao_cidian_check("掌上明住")
    # print(a)
    t1 = time.time()
    # b = check_by_bigdata("录背室")
    # b = youdao_cidian_check("录背室")
    b = KM_cidian_check("录背室")
    # b = youdao_cidian_check("以出认可")
    print(b)
    print(time.time() - t1)