# -*- coding: utf-8 -*-
import re
import sqlite3

import jieba  # jieba Chinese word segmentation
import requests

from Utils.util import cos_sim
from Utils.han_semantic_similarity import pos_tag_han

# logger = my_config.myLog(__name__, log_cate="misswritten_log").getlog()


def err_mean_judge(mean_hw_list, ans_list):
    """
    Judge whether an answer expresses a wrong or opposite meaning,
    using a spell-check dictionary as an auxiliary signal.
    :param mean_hw_list: candidate meanings written by the student
    :param ans_list: reference answers
    :return: tuple of (judge result, char-level similarity flag, suspicious word)
    """
    from Autochecker4Chinese import auto_check
    char_similar_score = 1
    for ans_hw in mean_hw_list:
        # 1. Check for misspelled characters first.
        # corrected_sent, detail = pycorrector.correct('ans_hw')  # the pycorrector model download is too large
        # print(auto_check.auto_correct(ans_hw))
        if "…" in ans_hw:
            ans_hw = re.sub(r"^.*?…+的?(.+)", r"\1", ans_hw)
        # Strip reduplication and common function-word prefixes/suffixes before checking.
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2[地了的得等]", r"\1", ans_hw)  # e.g. 信心满满的
        new_ans_hw = re.sub(r"^(.{2,})[地了的得等]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(.+)出[来去]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(向前|得到|不是|继续|[使得有我令对让向将好]+)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^的+|^(想要|正在)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^未([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"某人[\u4e00-\u9fa5]+某事", "", new_ans_hw)
        new_ans_hw = re.sub(r"某[人物事]", "", new_ans_hw)
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2", r"\1\2", new_ans_hw)  # e.g. 速率率
        if re.search("为$", new_ans_hw) and pos_tag_han(re.sub("为+$", "", new_ans_hw)) == 'v':
            new_ans_hw = re.sub("为+$", "", new_ans_hw)
        elif re.search("^不", new_ans_hw) and pos_tag_han(re.sub("^不", "", new_ans_hw)) == 'a':
            new_ans_hw = re.sub("^不", "", new_ans_hw)
        new_ans_hw = re.sub(r"^不([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        if len(new_ans_hw) > 1:
            # Misspelling check: ask the checker for candidate corrections.
            may_candidate = auto_check.auto_correct(new_ans_hw, flag="all")
            print("new_ans_hw:{}---may_candidate:::{}".format(new_ans_hw, may_candidate))
            if may_candidate and new_ans_hw not in may_candidate:
                # logger.info("含错别字:{}-{}\nmay_candidate:{}".format(ans_hw, new_ans_hw, may_candidate))
                if len(may_candidate) <= 5:
                    if len(new_ans_hw) == 2 and re.search("的$", new_ans_hw):
                        continue
                    if len(new_ans_hw) > 2:
                        # Retry after stripping a few more common prefixes/suffixes.
                        new_ans_hw2 = re.sub(r"^(绝|开始|才|最|刚+|在)", "", new_ans_hw)
                        new_ans_hw2 = re.sub(r"会$", "", new_ans_hw2)
                        if new_ans_hw2 != new_ans_hw and len(new_ans_hw2) > 1:
                            may_candidate = auto_check.auto_correct(new_ans_hw2, flag="all")
                            if may_candidate and new_ans_hw2 not in may_candidate:
                                pass
                            else:
                                continue
                        elif len(new_ans_hw) >= 4:
                            # Longer phrases: check each jieba token separately.
                            jieba_tok = jieba.lcut(new_ans_hw)
                            may_right = []
                            for tok in jieba_tok:
                                may_candidate = auto_check.auto_correct(tok, flag="all")
                                if may_candidate and tok not in may_candidate:
                                    may_right.append(0)
                                else:
                                    may_right.append(1)
                                    break
                            if sum(may_right) / len(may_right) == 1:
                                continue
                            else:
                                return 0.91, char_similar_score, new_ans_hw
                return 0, char_similar_score, new_ans_hw
        # Char-level similarity against the reference answers.
        if ans_hw not in ans_list and all(len(ans_hw) <= 4 and cos_sim(ans, ans_hw) < 0.2
                                          for ans in ans_list):
            char_similar_score = 0
        # if may_candidate != new_ans_hw and may_candidate not in ans_list and \
        #         cos_sim(may_candidate, new_ans_hw) < 0.8:  # for now, e.g. "每一个" vs "每个"
        #     return 0
    return 1, char_similar_score, ""
    # Incomplete reference answers are ignored for now.
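

# Illustrative sketch, not part of the original pipeline: the normalization step in
# err_mean_judge strips reduplication and function-word suffixes before spell-checking.
# The helper below replays a few of those substitutions on made-up sample words so the
# intent of the patterns is easier to see.
def _demo_normalize(word):
    """Replay some of the reduplication/suffix-stripping patterns from err_mean_judge."""
    word = re.sub(r"^(.*)([\u4e00-\u9fa5])\2[地了的得等]", r"\1", word)  # 信心满满的 -> 信心
    word = re.sub(r"^(.*)([\u4e00-\u9fa5])\2", r"\1\2", word)           # 速率率 -> 速率
    word = re.sub(r"^(.{2,})[地了的得等]$", r"\1", word)                 # 高兴的 -> 高兴
    return word
# for w in ("信心满满的", "速率率", "高兴的"):
#     print(w, "->", _demo_normalize(w))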


def KM_cidian_check(text):
    """
    Check whether a Chinese word exists via the KM online dictionary (kmcha.com).
    :param text: word to check
    :return: 1 if the word exists, 0 otherwise
    """
    word = str(text).strip()  # .replace(" ", "%20").replace("!", " ")  # .replace(".", "")
    try:
        r = requests.get("https://kmcha.com/cidian/{}".format(word), timeout=0.5)  # time-consuming
    except Exception:
        return 0
    text = r.content.decode('utf8')
    # print(text)
    if re.search("<strong>.{2,4}释义</strong>", text):
        # The page has a definition section, so the word exists.
        return 1
    elif re.search(r"<p>[\s\n]*<a href=.*?</a>[\s\n]*</p>[\s\n]*</div>", text, flags=re.S) is None:
        return 1
    else:
        # The page only lists related queries; collect them and compare against the word.
        related_ci = []
        a_info = re.findall(r"<div>[\s\n]*<p>[\s\n]*<strong>相关查询</strong>[\s\n]*</p>[\s\n]*<p>(.+?)</p>[\s\n]*</div>",
                            text, flags=re.S)
        if a_info:
            related_ci.extend(re.findall("<a href=[^>]*?>(.+?)</a>", a_info[0]))
            print("related_ci:", related_ci)
        # if word not in related_ci:
        #     return 0
        # else:
        new_related_ci = related_ci[:-2]
        print(new_related_ci)
        # if len(text) <= 2:
        related_ci_text = ";".join(new_related_ci)
        if word in new_related_ci or word in related_ci_text:
            return 1
        elif len(text) <= 3:
            return 0
        else:
            # Take the longest prefix of the word whose characters appear among the related words.
            exist_w = ""
            idx = 0
            for i in range(len(word)):
                if word[i] in related_ci_text:
                    exist_w += word[i]
                    idx += 1
                else:
                    break
            if len(exist_w) >= 2 and len(word[idx:]) >= 2:
                from Autochecker4Chinese import auto_check
                may_candidate = auto_check.auto_correct(word[idx:], flag="all")
                if may_candidate and word[idx:] in may_candidate:
                    return 1
                else:
                    return 0
            else:
                return 0
    # return 0
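

# Illustrative sketch: KM_cidian_check relies on the "相关查询" (related queries) block of
# the kmcha.com page. The helper below runs the same two regexes on a minimal hand-written
# HTML fragment (not a real server response) to show what related_ci ends up containing.
def _demo_km_related():
    """Replay the related-word extraction on a tiny hand-written HTML fragment."""
    sample = ('<div> <p><strong>相关查询</strong></p> '
              '<p><a href="/cidian/a">词语一</a><a href="/cidian/b">词语二</a></p> </div>')
    a_info = re.findall(r"<div>[\s\n]*<p>[\s\n]*<strong>相关查询</strong>[\s\n]*</p>[\s\n]*<p>(.+?)</p>[\s\n]*</div>",
                        sample, flags=re.S)
    if a_info:
        print(re.findall("<a href=[^>]*?>(.+?)</a>", a_info[0]))  # ['词语一', '词语二']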


def youdao_cidian_check(text):
    """
    Check whether a word exists via the Youdao online dictionary.
    :return: 1 if the word exists, 0 otherwise
    """
    try:
        r = requests.get("http://dict.youdao.com/w/{}/#keyfrom=dict2.top".format(text), timeout=0.5)
    except Exception:
        return 0
    all_content = r.content.decode('utf-8')
    # print(all_content)
    if "您要找的是不是:" in all_content:
        # Youdao suggests an alternative spelling, so the word itself was not found.
        return 0
    return 1
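

# Usage sketch (an assumption, not part of the original flow): the two online checks can
# be chained so that a word only counts as non-existent when both Youdao and the KM
# dictionary fail to find it. Requires network access when actually called.
def _word_exists_online(word):
    """Return 1 if either online dictionary knows the word, else 0."""
    if youdao_cidian_check(word) == 1:
        return 1
    return KM_cidian_check(word)
# print(_word_exists_online("掌上明珠"))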


def check_by_bigdata(txt):
    """
    Use a large corpus (Chinese Wikipedia stored in SQLite) to judge whether a
    collocation is common, and hence whether it is likely correct.
    :return: 1 common, 0 not found, -1 uncertain
    """
    db_file = r'F:\zwj\WiKi\wiki_zh.db'
    conn_seg1 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_1.db")
    conn_seg2 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_2.db")
    conn_seg3 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_3.db")
    cur1 = conn_seg1.cursor()
    cur2 = conn_seg2.cursor()
    cur3 = conn_seg3.cursor()

    def search(ss, mod=0):  # mod=0: existence check only, no result set returned
        res = []
        cur1.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
        res1 = cur1.fetchone()
        if mod == 1:
            # Collect the document ids of this segment from all three shards.
            if res1:
                res.extend(res1[1].split(","))
            cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res2 = cur2.fetchone()
            if res2:
                res.extend(res2[1].split(","))
            cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res3 = cur3.fetchone()
            if res3:
                res.extend(res3[1].split(","))
            res = set(res)
            return res
        else:
            if not res1:  # the row was already fetched above; do not call fetchone() again
                cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                if not cur2.fetchone():
                    cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                    if not cur3.fetchone():
                        return 0
            return 1

    def text_seg(txt):
        segs = jieba.lcut(txt.strip(), cut_all=False)  # jieba precise mode
        f_stop = open(r'G:\zwj\WL\en2cn\files\main\stopwords.txt', encoding='utf-8')  # custom Chinese stopword list
        stopwords = [line.strip() for line in f_stop]  # strip() removes leading/trailing whitespace
        f_stop.close()
        segs = [re.sub(r"^[a-zA-Z\d_]+|[a-zA-Z\d_]+$", "", wl) for wl in segs]
        new_segs = [s.strip() for s in segs if
                    len(s) > 1 and re.search("[\u4e00-\u9fa5]", s) and s.strip() not in stopwords]
        # if len(segs) >= 2:  # combining adjacent segments did not work well
        #     for n in range(len(segs) - 1):
        #         new_segs.append(segs[n] + segs[n + 1])
        return new_segs

    term_data = search(txt.strip())
    if not term_data:
        segs = text_seg(txt)
        if segs:
            # print(segs)
            comid = search(segs[0], mod=1)
            if len(segs) > 1:
                # Intersect the document ids shared by all segments.
                for seg in segs[1:]:
                    t_id = search(seg, mod=1)
                    comid = comid.intersection(t_id)
                if not comid:
                    return 0
                elif len(comid) > 5:
                    # print(comid)
                    return -1  # uncertain
                else:
                    # Few shared documents: verify the exact collocation in the source text.
                    conn = sqlite3.connect(database=db_file)
                    cur = conn.cursor()
                    sourcetxt = ""
                    for id in comid:
                        cur.execute("SELECT * FROM wiki_zh_table WHERE seg_id=?", (id,))
                        restxt = cur.fetchone()
                        if restxt:
                            sourcetxt += restxt[1]
                    cur.close()
                    if txt not in sourcetxt:
                        return 0
            elif not comid:
                return 0
    cur1.close()
    cur2.close()
    cur3.close()
    return 1
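

# Schema sketch: check_by_bigdata only assumes that each shard has a table
# seg_table(seg, ids) mapping a segment to comma-separated document ids, and that
# wiki_zh.db has wiki_zh_table(seg_id, content) holding the source text. The column
# names below are an assumption inferred from the queries above; the helper builds a
# throwaway in-memory shard with that shape to show the data search() intersects.
def _demo_seg_schema():
    """Build a throwaway in-memory shard with the assumed seg_table layout."""
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE seg_table (seg TEXT PRIMARY KEY, ids TEXT)")
    conn.execute("INSERT INTO seg_table VALUES (?, ?)", ("自然语言", "12,87,903"))
    row = conn.execute("SELECT * FROM seg_table WHERE seg=?", ("自然语言",)).fetchone()
    print(row[1].split(","))  # ['12', '87', '903'] -- the id list that search() collects
    conn.close()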


if __name__ == '__main__':
    import time

    mean_hw_list = []
    ans_list = []
    # err_mean_judge(mean_hw_list, ans_list)
    # a = KM_cidian_check("想要做某事")
    # print(a)
    # a = youdao_cidian_check("掌上明住")
    # print(a)
    t1 = time.time()
    # b = check_by_bigdata("录背室")
    # b = youdao_cidian_check("录背室")
    b = KM_cidian_check("录背室")
    # b = youdao_cidian_check("以出认可")
    print(b)
    print(time.time() - t1)