# -*- coding: utf-8 -*-
import re
import sqlite3

import jieba  # jieba Chinese word segmentation
import requests

from Utils.util import cos_sim
from Utils.han_semantic_similarity import pos_tag_han

# logger = my_config.myLog(__name__, log_cate="misswritten_log").getlog()


def err_mean_judge(mean_hw_list, ans_list):
    """
    Judge wrong (or opposite) meanings in an answer's headwords,
    using a spell-check dictionary as an auxiliary signal.
    :param mean_hw_list: headword strings extracted from the answer
    :param ans_list: reference answers
    :return: (score, char_similar_score, suspect_word)
    """
    from Autochecker4Chinese import auto_check

    char_similar_score = 1
    for ans_hw in mean_hw_list:
        # 1. First check for misspelled characters.
        # corrected_sent, detail = pycorrector.correct('ans_hw')  # the model to download is too large
        # print(auto_check.auto_correct(ans_hw))
        if "…" in ans_hw:
            ans_hw = re.sub(r"^.*?…+的?(.+)", r"\1", ans_hw)
        # Strip function words and common affixes so only the content word is spell-checked.
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2[地了的得等]", r"\1", ans_hw)  # e.g. 信心满满的
        new_ans_hw = re.sub(r"^(.{2,})[地了的得等]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(.+)出[来去]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(向前|得到|不是|继续|[使得有我令对让向将好]+)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^的+|^(想要|正在)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^未([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        new_ans_hw = re.sub("某人[\u4e00-\u9fa5]+某事", "", new_ans_hw)
        new_ans_hw = re.sub("某[人物事]", "", new_ans_hw)
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2", r"\1\2", new_ans_hw)  # e.g. 速率率 -> 速率
        if re.search("为$", new_ans_hw) and pos_tag_han(re.sub("为+$", "", new_ans_hw)) == 'v':
            new_ans_hw = re.sub("为+$", "", new_ans_hw)
        elif re.search("^不", new_ans_hw) and pos_tag_han(re.sub("^不", "", new_ans_hw)) == 'a':
            new_ans_hw = re.sub("^不", "", new_ans_hw)
        new_ans_hw = re.sub(r"^不([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        if len(new_ans_hw) > 1:
            # Misspelled-character check.
            may_candidate = auto_check.auto_correct(new_ans_hw, flag="all")
            print("new_ans_hw:{}---may_candidate:::{}".format(new_ans_hw, may_candidate))
            if may_candidate and new_ans_hw not in may_candidate:
                # logger.info("contains misspelling: {}-{}\nmay_candidate:{}".format(ans_hw, new_ans_hw, may_candidate))
                if len(may_candidate) <= 5:
                    if len(new_ans_hw) == 2 and re.search("的$", new_ans_hw):
                        continue
                    if len(new_ans_hw) > 2:
                        # Retry after stripping leading adverbs / a trailing 会.
                        new_ans_hw2 = re.sub(r"^(绝|开始|才|最|刚+|在)", "", new_ans_hw)
                        new_ans_hw2 = re.sub(r"会$", "", new_ans_hw2)
                        if new_ans_hw2 != new_ans_hw and len(new_ans_hw2) > 1:
                            may_candidate = auto_check.auto_correct(new_ans_hw2, flag="all")
                            if may_candidate and new_ans_hw2 not in may_candidate:
                                pass
                            else:
                                continue
                elif len(new_ans_hw) >= 4:
                    # Many candidates and a long word: spell-check each jieba segment instead.
                    jieba_tok = jieba.lcut(new_ans_hw)
                    may_right = []
                    for tok in jieba_tok:
                        may_candidate = auto_check.auto_correct(tok, flag="all")
                        if may_candidate and tok not in may_candidate:
                            may_right.append(0)
                        else:
                            may_right.append(1)
                            break
                    if sum(may_right) / len(may_right) == 1:
                        continue
                    else:
                        return 0.91, char_similar_score, new_ans_hw
                return 0, char_similar_score, new_ans_hw
        # Character-level similarity against the reference answers.
        if ans_hw not in ans_list and all(len(ans_hw) <= 4 and cos_sim(ans, ans_hw) < 0.2
                                          for ans in ans_list):
            char_similar_score = 0
        # if may_candidate != new_ans_hw and may_candidate not in ans_list and \
        #         cos_sim(may_candidate, new_ans_hw) < 0.8:  # tentative, e.g. "每一个" vs "每个"
        #     return 0
    return 1, char_similar_score, ""
    # Incomplete reference answers are not handled for now.
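
# A minimal usage sketch (hedged: the sample words and the reading of the
# returned triple are inferred from the code above, not from project docs):
#     score, char_sim, suspect = err_mean_judge(["信心满满的"], ["满怀信心"])
#     # score == 1 -> no misspelling found; 0 or 0.91 flag a likely typo,
#     # with `suspect` holding the normalized word that failed the check.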


def KM_cidian_check(text):
    """
    KM dictionary (kmcha.com): check whether a Chinese word exists.
    :param text:
    :return: 1 = exists, 0 = does not exist
    """
    word = str(text).strip()  # .replace(" ", "%20").replace("!", " ")  # .replace(".", "")
    try:
        r = requests.get("https://kmcha.com/cidian/{}".format(word), timeout=0.5)  # this request is slow
    except Exception:
        return 0
    text = r.content.decode('utf8')
    # print(text)
    # NOTE: the HTML tags inside the patterns below were destroyed when this file
    # was run through an HTML sanitizer; the tags used here are an assumed
    # reconstruction of the kmcha.com markup and should be re-verified against
    # the live page.
    if re.search(".{2,4}释义", text):  # a 释义 (definition) heading means the word exists
        return 1
    elif re.search(r"<div[^>]*>[\s\n]*<p[^>]*>[\s\n]*</p>[\s\n]*</div>", text, flags=re.S) is None:
        # No empty-result block on the page: treat the word as existing.
        return 1
    else:
        related_ci = []
        # Pull the 相关查询 (related queries) block and collect its link texts.
        a_info = re.findall(r"<div[^>]*>[\s\n]*<h3[^>]*>[\s\n]*相关查询[\s\n]*</h3>[\s\n]*<p[^>]*>(.+?)</p>[\s\n]*</div>",
                            text, flags=re.S)
        if a_info:
            related_ci.extend(re.findall(r"<a[^>]*?>(.+?)</a>", a_info[0]))
        print("related_ci:", related_ci)
        # if word not in related_ci:
        #     return 0
        # else:
        new_related_ci = related_ci[:-2]  # the last two entries are dropped (navigation links, presumably)
        print(new_related_ci)
        # if len(text) <= 2:
        related_ci_text = ";".join(new_related_ci)
        if word in new_related_ci or word in related_ci_text:
            return 1
        elif len(text) <= 3:
            return 0
        else:
            # Find the longest prefix of `word` that appears in the related-query text.
            exist_w = ""
            idx = 0
            for i in range(len(word)):
                if word[i] in related_ci_text:
                    exist_w += word[i]
                    idx += 1
                else:
                    break
            if len(exist_w) >= 2 and len(word[idx:]) >= 2:
                # Spell-check the unmatched tail.
                from Autochecker4Chinese import auto_check
                may_candidate = auto_check.auto_correct(word[idx:], flag="all")
                if may_candidate and word[idx:] in may_candidate:
                    return 1
                else:
                    return 0
            else:
                return 0
            # return 0
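
# Usage sketch (hedged: results depend on the live kmcha.com page, whose markup
# the patterns above only approximate):
#     KM_cidian_check("掌上明珠")  # expected 1: the page shows a 释义 (definition) block
#     KM_cidian_check("掌上明住")  # expected 0: a misspelled word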


def youdao_cidian_check(text):
    """
    Youdao dictionary: check whether a word exists.
    :return: 1 = exists, 0 = does not exist
    """
    try:
        r = requests.get("http://dict.youdao.com/w/{}/#keyfrom=dict2.top".format(text), timeout=0.5)
    except Exception:
        return 0
    all_content = r.content.decode('utf-8')
    # print(all_content)
    if "您要找的是不是:" in all_content:  # Youdao's "Did you mean:" prompt
        return 0
    return 1
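
# Usage sketch (hedged: relies on Youdao rendering its "您要找的是不是:"
# ("Did you mean:") prompt for unknown words; the sample word is illustrative):
#     youdao_cidian_check("掌上明住")  # expected 0: the typo triggers the prompt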


def check_by_bigdata(txt):
    """
    Judge whether a collocation is common via a big-data (Chinese Wikipedia)
    lookup, and hence whether it is likely correct.
    :return: 1 = found/common, 0 = not found, -1 = uncertain
    """
    db_file = r'F:\zwj\WiKi\wiki_zh.db'
    conn_seg1 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_1.db")
    conn_seg2 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_2.db")
    conn_seg3 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_3.db")
    cur1 = conn_seg1.cursor()
    cur2 = conn_seg2.cursor()
    cur3 = conn_seg3.cursor()

    def search(ss, mod=0):  # mod=0: existence check only, no ids returned
        res = []
        cur1.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
        res1 = cur1.fetchone()
        if mod == 1:
            if res1:
                res.extend(res1[1].split(","))
            cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res2 = cur2.fetchone()
            if res2:
                res.extend(res2[1].split(","))
            cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res3 = cur3.fetchone()
            if res3:
                res.extend(res3[1].split(","))
            return set(res)
        else:
            # Reuse res1 here: calling cur1.fetchone() a second time would
            # advance the cursor past the matched row and wrongly report a miss.
            if not res1:
                cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                if not cur2.fetchone():
                    cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                    if not cur3.fetchone():
                        return 0
            return 1
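
    # Assumed schema (inferred from the queries above; not documented in this
    # repo): seg_table(seg TEXT, doc_ids TEXT), where doc_ids is a comma-separated
    # id list, and wiki_zh_table(seg_id, text) holds the source passages.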

    def text_seg(txt):
        segs = jieba.lcut(txt.strip(), cut_all=False)  # accurate mode
        with open(r'G:\zwj\WL\en2cn\files\main\stopwords.txt', encoding='utf-8') as f_stop:
            stopwords = [line.strip() for line in f_stop]  # custom Chinese stop-word list
        segs = [re.sub(r"^[a-zA-Z\d_]+|[a-zA-Z\d_]+$", "", wl) for wl in segs]
        new_segs = [s.strip() for s in segs if
                    len(s) > 1 and re.search("[\u4e00-\u9fa5]", s) and s.strip() not in stopwords]
        # if len(segs) >= 2:  # combining adjacent segments did not work well
        #     for n in range(len(segs) - 1):
        #         new_segs.append(segs[n] + segs[n + 1])
        return new_segs
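
    # e.g. text_seg("录背室的照片") might return ["录背室", "照片"] (illustrative
    # only: actual output depends on jieba's dictionary and the stop-word list).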

    term_data = search(txt.strip())
    if not term_data:
        segs = text_seg(txt)
        if segs:
            # print(segs)
            # Intersect the document-id sets of all segments.
            comid = search(segs[0], mod=1)
            if len(segs) > 1:
                for seg in segs[1:]:
                    t_id = search(seg, mod=1)
                    comid = comid.intersection(t_id)
                if not comid:
                    return 0
                elif len(comid) > 5:
                    # print(comid)
                    return -1  # uncertain
                else:
                    # Few shared documents: verify the literal collocation there.
                    conn = sqlite3.connect(database=db_file)
                    cur = conn.cursor()
                    sourcetxt = ""
                    for sid in comid:
                        cur.execute("SELECT * FROM wiki_zh_table WHERE seg_id=?", (sid,))
                        restxt = cur.fetchone()
                        if restxt:
                            sourcetxt += restxt[1]
                    cur.close()
                    if txt not in sourcetxt:
                        return 0
            elif not comid:  # single segment with no matching documents
                return 0
    cur1.close()
    cur2.close()
    cur3.close()
    return 1
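
# Usage sketch (hedged: the F:\ and G:\ database / stop-word paths above are
# machine-specific, so this only runs where those files exist):
#     check_by_bigdata("录背室")  # expected 0: collocation absent from the wiki corpus
#     check_by_bigdata("阅览室")  # expected 1 or -1, depending on how many docs match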


if __name__ == '__main__':
    import time

    mean_hw_list = []
    ans_list = []
    # err_mean_judge(mean_hw_list, ans_list)
    # a = KM_cidian_check("想要做某事")
    # print(a)
    # a = youdao_cidian_check("掌上明住")
    # print(a)
    t1 = time.time()
    # b = check_by_bigdata("录背室")
    # b = youdao_cidian_check("录背室")
    b = KM_cidian_check("录背室")
    # b = youdao_cidian_check("以出认可")
    print(b)
    print(time.time() - t1)