# -*- coding: utf-8 -*-
import re
import sqlite3

import jieba  # jieba Chinese word segmentation
import requests

from Utils.util import cos_sim
from Utils.han_semantic_similarity import pos_tag_han

# logger = my_config.myLog(__name__, log_cate="misswritten_log").getlog()


def err_mean_judge(mean_hw_list, ans_list):
    """
    Judge whether an answer expresses a wrong or opposite meaning,
    using a spell-check dictionary as an auxiliary signal.
    :param mean_hw_list: candidate meanings written by the student
    :param ans_list: reference answers
    :return: tuple of (judge result, char-level similarity flag, suspicious word)
    """
    from Autochecker4Chinese import auto_check
    char_similar_score = 1
    for ans_hw in mean_hw_list:
        # 1. Check for misspelled characters first.
        # corrected_sent, detail = pycorrector.correct('ans_hw')  # the pycorrector model download is too large
        # print(auto_check.auto_correct(ans_hw))
        if "…" in ans_hw:
            ans_hw = re.sub(r"^.*?…+的?(.+)", r"\1", ans_hw)
        # Strip reduplication and common function-word prefixes/suffixes before checking.
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2[地了的得等]", r"\1", ans_hw)  # e.g. 信心满满的
        new_ans_hw = re.sub(r"^(.{2,})[地了的得等]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(.+)出[来去]$", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"^(向前|得到|不是|继续|[使得有我令对让向将好]+)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^的+|^(想要|正在)", "", new_ans_hw)
        new_ans_hw = re.sub(r"^未([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        new_ans_hw = re.sub(r"某人[\u4e00-\u9fa5]+某事", "", new_ans_hw)
        new_ans_hw = re.sub(r"某[人物事]", "", new_ans_hw)
        new_ans_hw = re.sub(r"^(.*)([\u4e00-\u9fa5])\2", r"\1\2", new_ans_hw)  # e.g. 速率率
        if re.search("为$", new_ans_hw) and pos_tag_han(re.sub("为+$", "", new_ans_hw)) == 'v':
            new_ans_hw = re.sub("为+$", "", new_ans_hw)
        elif re.search("^不", new_ans_hw) and pos_tag_han(re.sub("^不", "", new_ans_hw)) == 'a':
            new_ans_hw = re.sub("^不", "", new_ans_hw)
        new_ans_hw = re.sub(r"^不([\u4e00-\u9fa5]{2,})", r"\1", new_ans_hw)
        if len(new_ans_hw) > 1:
            # Misspelling check: ask the checker for candidate corrections.
            may_candidate = auto_check.auto_correct(new_ans_hw, flag="all")
            print("new_ans_hw:{}---may_candidate:::{}".format(new_ans_hw, may_candidate))
            if may_candidate and new_ans_hw not in may_candidate:
                # logger.info("含错别字:{}-{}\nmay_candidate:{}".format(ans_hw, new_ans_hw, may_candidate))
                if len(may_candidate) <= 5:
                    if len(new_ans_hw) == 2 and re.search("的$", new_ans_hw):
                        continue
                    if len(new_ans_hw) > 2:
                        # Retry after stripping a few more common prefixes/suffixes.
                        new_ans_hw2 = re.sub(r"^(绝|开始|才|最|刚+|在)", "", new_ans_hw)
                        new_ans_hw2 = re.sub(r"会$", "", new_ans_hw2)
                        if new_ans_hw2 != new_ans_hw and len(new_ans_hw2) > 1:
                            may_candidate = auto_check.auto_correct(new_ans_hw2, flag="all")
                            if may_candidate and new_ans_hw2 not in may_candidate:
                                pass
                            else:
                                continue
                        elif len(new_ans_hw) >= 4:
                            # Longer phrases: check each jieba token separately.
                            jieba_tok = jieba.lcut(new_ans_hw)
                            may_right = []
                            for tok in jieba_tok:
                                may_candidate = auto_check.auto_correct(tok, flag="all")
                                if may_candidate and tok not in may_candidate:
                                    may_right.append(0)
                                else:
                                    may_right.append(1)
                                    break
                            if sum(may_right) / len(may_right) == 1:
                                continue
                            else:
                                return 0.91, char_similar_score, new_ans_hw
                return 0, char_similar_score, new_ans_hw
        # Char-level similarity against the reference answers.
        if ans_hw not in ans_list and all(len(ans_hw) <= 4 and cos_sim(ans, ans_hw) < 0.2
                                          for ans in ans_list):
            char_similar_score = 0
        # if may_candidate != new_ans_hw and may_candidate not in ans_list and \
        #         cos_sim(may_candidate, new_ans_hw) < 0.8:  # for now, e.g. "每一个" vs "每个"
        #     return 0
    return 1, char_similar_score, ""
    # Incomplete reference answers are ignored for now.
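

# Illustrative sketch, not part of the original pipeline: the normalization step in
# err_mean_judge strips reduplication and function-word suffixes before spell-checking.
# The helper below replays a few of those substitutions on made-up sample words so the
# intent of the patterns is easier to see.
def _demo_normalize(word):
    """Replay some of the reduplication/suffix-stripping patterns from err_mean_judge."""
    word = re.sub(r"^(.*)([\u4e00-\u9fa5])\2[地了的得等]", r"\1", word)  # 信心满满的 -> 信心
    word = re.sub(r"^(.*)([\u4e00-\u9fa5])\2", r"\1\2", word)           # 速率率 -> 速率
    word = re.sub(r"^(.{2,})[地了的得等]$", r"\1", word)                 # 高兴的 -> 高兴
    return word
# for w in ("信心满满的", "速率率", "高兴的"):
#     print(w, "->", _demo_normalize(w))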


def KM_cidian_check(text):
    """
    Check whether a Chinese word exists via the KM online dictionary (kmcha.com).
    :param text: word to check
    :return: 1 if the word exists, 0 otherwise
    """
    word = str(text).strip()  # .replace(" ", "%20").replace("!", " ")  # .replace(".", "")
    try:
        r = requests.get("https://kmcha.com/cidian/{}".format(word), timeout=0.5)  # time-consuming
    except Exception:
        return 0
    text = r.content.decode('utf8')
    # print(text)
    if re.search("<strong>.{2,4}释义</strong>", text):
        # The page has a definition section, so the word exists.
        return 1
    elif re.search(r"<p>[\s\n]*<a href=.*?</a>[\s\n]*</p>[\s\n]*</div>", text, flags=re.S) is None:
        return 1
    else:
        # The page only lists related queries; collect them and compare against the word.
        related_ci = []
        a_info = re.findall(r"<div>[\s\n]*<p>[\s\n]*<strong>相关查询</strong>[\s\n]*</p>[\s\n]*<p>(.+?)</p>[\s\n]*</div>",
                            text, flags=re.S)
        if a_info:
            related_ci.extend(re.findall("<a href=[^>]*?>(.+?)</a>", a_info[0]))
            print("related_ci:", related_ci)
        # if word not in related_ci:
        #     return 0
        # else:
        new_related_ci = related_ci[:-2]
        print(new_related_ci)
        # if len(text) <= 2:
        related_ci_text = ";".join(new_related_ci)
        if word in new_related_ci or word in related_ci_text:
            return 1
        elif len(text) <= 3:
            return 0
        else:
            # Take the longest prefix of the word whose characters appear among the related words.
            exist_w = ""
            idx = 0
            for i in range(len(word)):
                if word[i] in related_ci_text:
                    exist_w += word[i]
                    idx += 1
                else:
                    break
            if len(exist_w) >= 2 and len(word[idx:]) >= 2:
                from Autochecker4Chinese import auto_check
                may_candidate = auto_check.auto_correct(word[idx:], flag="all")
                if may_candidate and word[idx:] in may_candidate:
                    return 1
                else:
                    return 0
            else:
                return 0
    # return 0
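

# Illustrative sketch: KM_cidian_check relies on the "相关查询" (related queries) block of
# the kmcha.com page. The helper below runs the same two regexes on a minimal hand-written
# HTML fragment (not a real server response) to show what related_ci ends up containing.
def _demo_km_related():
    """Replay the related-word extraction on a tiny hand-written HTML fragment."""
    sample = ('<div> <p><strong>相关查询</strong></p> '
              '<p><a href="/cidian/a">词语一</a><a href="/cidian/b">词语二</a></p> </div>')
    a_info = re.findall(r"<div>[\s\n]*<p>[\s\n]*<strong>相关查询</strong>[\s\n]*</p>[\s\n]*<p>(.+?)</p>[\s\n]*</div>",
                        sample, flags=re.S)
    if a_info:
        print(re.findall("<a href=[^>]*?>(.+?)</a>", a_info[0]))  # ['词语一', '词语二']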


def youdao_cidian_check(text):
    """
    Check whether a word exists via the Youdao online dictionary.
    :return: 1 if the word exists, 0 otherwise
    """
    try:
        r = requests.get("http://dict.youdao.com/w/{}/#keyfrom=dict2.top".format(text), timeout=0.5)
    except Exception:
        return 0
    all_content = r.content.decode('utf-8')
    # print(all_content)
    if "您要找的是不是:" in all_content:
        # Youdao suggests an alternative spelling, so the word itself was not found.
        return 0
    return 1
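

# Usage sketch (an assumption, not part of the original flow): the two online checks can
# be chained so that a word only counts as non-existent when both Youdao and the KM
# dictionary fail to find it. Requires network access when actually called.
def _word_exists_online(word):
    """Return 1 if either online dictionary knows the word, else 0."""
    if youdao_cidian_check(word) == 1:
        return 1
    return KM_cidian_check(word)
# print(_word_exists_online("掌上明珠"))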


def check_by_bigdata(txt):
    """
    Use a large corpus (Chinese Wikipedia stored in SQLite) to judge whether a
    collocation is common, and hence whether it is likely correct.
    :return: 1 common, 0 not found, -1 uncertain
    """
    db_file = r'F:\zwj\WiKi\wiki_zh.db'
    conn_seg1 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_1.db")
    conn_seg2 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_2.db")
    conn_seg3 = sqlite3.connect(database=r"F:\zwj\WiKi\wiki_segs_zh_3.db")
    cur1 = conn_seg1.cursor()
    cur2 = conn_seg2.cursor()
    cur3 = conn_seg3.cursor()

    def search(ss, mod=0):  # mod=0: existence check only, no result set returned
        res = []
        cur1.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
        res1 = cur1.fetchone()
        if mod == 1:
            # Collect the document ids of this segment from all three shards.
            if res1:
                res.extend(res1[1].split(","))
            cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res2 = cur2.fetchone()
            if res2:
                res.extend(res2[1].split(","))
            cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
            res3 = cur3.fetchone()
            if res3:
                res.extend(res3[1].split(","))
            res = set(res)
            return res
        else:
            if not res1:  # the row was already fetched above; do not call fetchone() again
                cur2.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                if not cur2.fetchone():
                    cur3.execute("SELECT * FROM seg_table WHERE seg=?", (ss,))
                    if not cur3.fetchone():
                        return 0
            return 1

    def text_seg(txt):
        segs = jieba.lcut(txt.strip(), cut_all=False)  # jieba precise mode
        f_stop = open(r'G:\zwj\WL\en2cn\files\main\stopwords.txt', encoding='utf-8')  # custom Chinese stopword list
        stopwords = [line.strip() for line in f_stop]  # strip() removes leading/trailing whitespace
        f_stop.close()
        segs = [re.sub(r"^[a-zA-Z\d_]+|[a-zA-Z\d_]+$", "", wl) for wl in segs]
        new_segs = [s.strip() for s in segs if
                    len(s) > 1 and re.search("[\u4e00-\u9fa5]", s) and s.strip() not in stopwords]
        # if len(segs) >= 2:  # combining adjacent segments did not work well
        #     for n in range(len(segs) - 1):
        #         new_segs.append(segs[n] + segs[n + 1])
        return new_segs

    term_data = search(txt.strip())
    if not term_data:
        segs = text_seg(txt)
        if segs:
            # print(segs)
            comid = search(segs[0], mod=1)
            if len(segs) > 1:
                # Intersect the document ids shared by all segments.
                for seg in segs[1:]:
                    t_id = search(seg, mod=1)
                    comid = comid.intersection(t_id)
                if not comid:
                    return 0
                elif len(comid) > 5:
                    # print(comid)
                    return -1  # uncertain
                else:
                    # Few shared documents: verify the exact collocation in the source text.
                    conn = sqlite3.connect(database=db_file)
                    cur = conn.cursor()
                    sourcetxt = ""
                    for id in comid:
                        cur.execute("SELECT * FROM wiki_zh_table WHERE seg_id=?", (id,))
                        restxt = cur.fetchone()
                        if restxt:
                            sourcetxt += restxt[1]
                    cur.close()
                    if txt not in sourcetxt:
                        return 0
            elif not comid:
                return 0
    cur1.close()
    cur2.close()
    cur3.close()
    return 1
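

# Schema sketch: check_by_bigdata only assumes that each shard has a table
# seg_table(seg, ids) mapping a segment to comma-separated document ids, and that
# wiki_zh.db has wiki_zh_table(seg_id, content) holding the source text. The column
# names below are an assumption inferred from the queries above; the helper builds a
# throwaway in-memory shard with that shape to show the data search() intersects.
def _demo_seg_schema():
    """Build a throwaway in-memory shard with the assumed seg_table layout."""
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE seg_table (seg TEXT PRIMARY KEY, ids TEXT)")
    conn.execute("INSERT INTO seg_table VALUES (?, ?)", ("自然语言", "12,87,903"))
    row = conn.execute("SELECT * FROM seg_table WHERE seg=?", ("自然语言",)).fetchone()
    print(row[1].split(","))  # ['12', '87', '903'] -- the id list that search() collects
    conn.close()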


if __name__ == '__main__':
    import time

    mean_hw_list = []
    ans_list = []
    # err_mean_judge(mean_hw_list, ans_list)
    # a = KM_cidian_check("想要做某事")
    # print(a)
    # a = youdao_cidian_check("掌上明住")
    # print(a)
    t1 = time.time()
    # b = check_by_bigdata("录背室")
    # b = youdao_cidian_check("录背室")
    b = KM_cidian_check("录背室")
    # b = youdao_cidian_check("以出认可")
    print(b)
    print(time.time() - t1)