# -*- coding: utf-8 -*-
import re
import sqlite3

import jieba  # jieba分词
import requests

from Utils.util import cos_sim
from Utils.han_semantic_similarity import pos_tag_han

# logger = my_config.myLog(__name__, log_cate="misswritten_log").getlog()


def err_mean_judge(mean_hw_list, ans_list):
    """
    错误意思或相反意思判断,加上一个词典辅助判断
    :param mean_hw_list: 待判断的词语(释义)列表
    :param ans_list: 参考答案词语列表
    :return: (判分, 字符级相似度, 疑似错别字的词)
    """
    from Autochecker4Chinese import auto_check
    char_similar_score = 1
    for ans_hw in mean_hw_list:
        # 1、先判断错别字
        # corrected_sent, detail = pycorrector.correct(ans_hw)  # 需下载的模型太大
        # print(auto_check.auto_correct(ans_hw))
        if "…" in ans_hw:
            ans_hw = re.sub(r"^.*?…+的?(.+)", r"\1", ans_hw)
        # 先逐步剥离常见的前后缀,得到待检的核心词
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2[地了的得等]", r"\1", ans_hw)  # 例:信心满满的
        new_ans_hw = re.sub(r"^(.{2,})[地了的得等]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(.+)出[来去]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(向前|得到|不是|继续|[使得有我令对让向将好]+)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^的+|^(想要|正在)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^未([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        new_ans_hw = re.sub("某人[\u4e00-\u9fa5]+某事", "", new_ans_hw)
        new_ans_hw = re.sub("某[人物事]", "", new_ans_hw)
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2", r"\1\2", new_ans_hw)  # 例:速率率
        if re.search("为$", new_ans_hw) and pos_tag_han(re.sub("为+$", "", new_ans_hw)) == 'v':
            new_ans_hw = re.sub("为+$", "", new_ans_hw)
        elif re.search("^不", new_ans_hw) and pos_tag_han(re.sub("^不", "", new_ans_hw)) == 'a':
            new_ans_hw = re.sub("^不", "", new_ans_hw)
        new_ans_hw = re.sub(r"^不([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        if len(new_ans_hw) > 1:
            # 错别字判断
            may_candidate = auto_check.auto_correct(new_ans_hw, flag="all")
            print("new_ans_hw:{}---may_candidate:::{}".format(new_ans_hw, may_candidate))
            if may_candidate and new_ans_hw not in may_candidate:
                # logger.info("含错别字:{}-{}\nmay_candidate:{}".format(ans_hw, new_ans_hw, may_candidate))
                if len(may_candidate) <= 5:
                    if len(new_ans_hw) == 2 and re.search("的$", new_ans_hw):
                        continue
                    if len(new_ans_hw) > 2:
                        new_ans_hw2 = re.sub(r"^(绝|开始|才|最|刚+|在)", "", new_ans_hw)
                        new_ans_hw2 = re.sub(r"会$", "", new_ans_hw2)
                        if new_ans_hw2 != new_ans_hw and len(new_ans_hw2) > 1:
                            may_candidate = auto_check.auto_correct(new_ans_hw2, flag="all")
                            if may_candidate and new_ans_hw2 not in may_candidate:
                                pass
                            else:
                                continue
                elif len(new_ans_hw) >= 4:
                    # 候选过多且词较长时,分词后逐个 token 复查
                    jieba_tok = jieba.lcut(new_ans_hw)
                    may_right = []
                    for tok in jieba_tok:
                        may_candidate = auto_check.auto_correct(tok, flag="all")
                        if may_candidate and tok not in may_candidate:
                            may_right.append(0)
                        else:
                            may_right.append(1)
                            break
                    if sum(may_right) / len(may_right) == 1:
                        continue
                    else:
                        return 0.91, char_similar_score, new_ans_hw
                return 0, char_similar_score, new_ans_hw
        # 字符级相似度
        if ans_hw not in ans_list and all(len(ans_hw) <= 4 and cos_sim(ans, ans_hw) < 0.2
                                          for ans in ans_list):
            char_similar_score = 0
        # if may_candidate != new_ans_hw and may_candidate not in ans_list and \
        #         cos_sim(may_candidate, new_ans_hw) < 0.8:  # 暂时,例“每一个”与“每个”
        #     return 0
    return 1, char_similar_score, ""  # 暂时先不考虑参考答案不全
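
# err_mean_judge 用法示意(假设 Autochecker4Chinese 已安装并能正常给出纠错候选;
# 示例输入“信心慢慢的”为虚构的含错写法,仅作说明):
# score, char_sim, wrong_word = err_mean_judge(["信心慢慢的"], ["信心满满的"])
# score == 0 表示检出错别字,0.91 表示分词复查后疑似有误,1 表示未检出
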

def KM_cidian_check(text):
    """
    KM词典,判断中文词语是否存在
    :param text: 待查词语
    :return: 1:存在 0:不存在
    """
    word = str(text).strip()  # .replace(" ", "%20").replace("!", " ")  # .replace(".", "")
    try:
        r = requests.get("https://kmcha.com/cidian/{}".format(word), timeout=0.5)  # 费时间
    except Exception:
        return 0
    text = r.content.decode('utf8')
    # print(text)
    if re.search(".{2,4}释义", text):
        return 1
    # 注意:下面两个正则里的 HTML 标签在代码流转中丢失,div/h3/p 等标签名为推测补回,
    # 使用前需对照 kmcha.com 页面的实际结构核对
    elif re.search(r"<div[^>]*>[\s\n]*<h3[^>]*>[\s\n]*相关查询[\s\n]*</h3>",
                   text, flags=re.S) is None:
        return 1
    else:
        related_ci = []
        a_info = re.findall(r"<h3[^>]*>[\s\n]*相关查询[\s\n]*</h3>[\s\n]*<p[^>]*>(.+?)</p>[\s\n]*</div>",
                            text, flags=re.S)
        if a_info:
            related_ci.extend(re.findall(r"<a[^>]*?>(.+?)</a>", a_info[0]))
        print("related_ci:", related_ci)
        # if word not in related_ci:
        #     return 0
        # else:
        new_related_ci = related_ci[:-2]  # 丢弃末尾两项(推测为非词条链接)
        print(new_related_ci)
        # if len(text) <= 2:
        related_ci_text = ";".join(new_related_ci)
        if word in new_related_ci or word in related_ci_text:
            return 1
        elif len(word) <= 3:  # 原代码此处为 len(text),但 text 此时已是整页 HTML,按上下文应为 word
            return 0
        else:
            # 逐字检查 word 的前缀是否出现在相关词文本中
            exist_w = ""
            idx = 0
            for i in range(len(word)):
                if word[i] in related_ci_text:
                    exist_w += word[i]
                    idx += 1
                else:
                    break
            if len(exist_w) >= 2 and len(word[idx:]) >= 2:
                from Autochecker4Chinese import auto_check
                may_candidate = auto_check.auto_correct(word[idx:], flag="all")
                if may_candidate and word[idx:] in may_candidate:
                    return 1
                else:
                    return 0
            else:
                return 0
    # return 0
def youdao_cidian_check(text):
    """
    有道词典,判断中文词语是否存在
    :return: 1:存在 0:不存在
    """
    try:
        r = requests.get("http://dict.youdao.com/w/{}/#keyfrom=dict2.top".format(text), timeout=0.5)
    except Exception:
        return 0
    all_content = r.content.decode('utf-8')
    # print(all_content)
    if "您要找的是不是:" in all_content:
        return 0
    return 1


def check_by_bigdata(txt):
    """
    通过大数据搜索判断搭配是否常见==>判断正误
    :return: 1:常见 0:不常见 -1:不确定
    """
    db_file = r'F:\zwj\WiKi\wiki_zh.db'
    conn_seg1 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_1.db")
    conn_seg2 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_2.db")
    conn_seg3 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_3.db")
    cur1 = conn_seg1.cursor()
    cur2 = conn_seg2.cursor()
    cur3 = conn_seg3.cursor()

    def search(ss, mod=0):  # mod=0:只判断是否存在,无需返回查询结果
        res = []
        cur1.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
        res1 = cur1.fetchone()
        if mod == 1:
            if res1:
                res.extend(res1[1].split(","))
            cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res2 = cur2.fetchone()
            if res2:
                res.extend(res2[1].split(","))
            cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res3 = cur3.fetchone()
            if res3:
                res.extend(res3[1].split(","))
            return set(res)
        else:
            # 原代码此处再次调用 cur1.fetchone(),会越过已取出的首行导致误判,改用 res1
            if not res1:
                cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                if not cur2.fetchone():
                    cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                    if not cur3.fetchone():
                        return 0
            return 1

    def text_seg(txt):
        segs = jieba.lcut(txt.strip(), cut_all=False)  # 精确模式
        f_stop = open(r'G:\zwj\WL\en2cn\files\main\stopwords.txt', encoding='utf-8')  # 自己的中文停用词表
        stopwords = [line.strip() for line in f_stop]  # strip() 移除字符串头尾空白
        f_stop.close()
        segs = [re.sub(r"^[a-zA-Z\d_]+|[a-zA-Z\d_]+$", "", wl) for wl in segs]
        new_segs = [s.strip() for s in segs
                    if len(s) > 1 and re.search("[\u4e00-\u9fa5]", s) and s.strip() not in stopwords]
        # if len(segs) >= 2:  # 组合效果不太好
        #     for n in range(len(segs) - 1):
        #         new_segs.append(segs[n] + segs[n + 1])
        return new_segs

    term_data = search(txt.strip())
    if not term_data:
        segs = text_seg(txt)
        if segs:
            # print(segs)
            comid = search(segs[0], mod=1)
            if len(segs) > 1:
                for seg in segs[1:]:
                    t_id = search(seg, mod=1)
                    comid = comid.intersection(t_id)
                if not comid:
                    return 0
                elif len(comid) > 5:
                    # print(comid)
                    return -1  # 不确定
                else:
                    # 候选文章不多时,取回原文确认该搭配确实出现过
                    conn = sqlite3.connect(database=db_file)
                    cur = conn.cursor()
                    sourcetxt = ""
                    for id in comid:
                        cur.execute("SELECT * FROM wiki_zh_table WHERE seg_id=?", (id,))
                        restxt = cur.fetchone()
                        if restxt:
                            sourcetxt += restxt[1]
                    cur.close()
                    conn.close()
                    if txt not in sourcetxt:
                        return 0
            elif not comid:
                return 0
    cur1.close()
    cur2.close()
    cur3.close()
    return 1
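
# 三种检查组合使用的示意(非原有逻辑,word_exists 为新增的假设性函数名,仅供参考):
# 先查有道词典,再查 KM 词典,都查不到时退回维基大数据搭配检查;
# check_by_bigdata 返回 -1(不确定)时这里保守地视为存在
def word_exists(word):
    if youdao_cidian_check(word):
        return 1
    if KM_cidian_check(word):
        return 1
    return 1 if check_by_bigdata(word) != 0 else 0
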

if __name__ == '__main__':
    import time

    mean_hw_list = []
    ans_list = []
    # err_mean_judge(mean_hw_list, ans_list)
    # a = KM_cidian_check("想要做某事")
    # print(a)
    # a = youdao_cidian_check("掌上明住")
    # print(a)
    t1 = time.time()
    # b = check_by_bigdata("录背室")
    # b = youdao_cidian_check("录背室")
    b = KM_cidian_check("录背室")
    # b = youdao_cidian_check("以出认可")
    print(b)
    print(time.time() - t1)