util.py

# -*- coding: utf-8 -*-
import re
import jieba  # jieba Chinese word segmentation
import time
# import difflib  # Method 1: Python's standard library difflib computes similarity directly
# from fuzzywuzzy import fuzz  # Method 2: the third-party fuzzywuzzy package computes similarity directly
import numpy as np
from collections import Counter
import pandas as pd
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# import multiprocessing
# import threading


# Compute the similarity of two Chinese sentences.
# Method 3: edit distance, also known as Levenshtein distance.
def edit_similar(str1, str2):  # str1 and str2 are token lists produced by segmentation
    len_str1 = len(str1)
    len_str2 = len(str2)
    taglist = np.zeros((len_str1 + 1, len_str2 + 1))
    # Boundary row/column must run to len + 1 so the last cells are initialised.
    for a in range(len_str1 + 1):
        taglist[a][0] = a
    for a in range(len_str2 + 1):
        taglist[0][a] = a
    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            if str1[i - 1] == str2[j - 1]:
                temp = 0
            else:
                temp = 1
            taglist[i][j] = min(taglist[i - 1][j - 1] + temp, taglist[i][j - 1] + 1, taglist[i - 1][j] + 1)
    return 1 - taglist[len_str1][len_str2] / max(len_str1, len_str2)
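

# Minimal usage sketch for edit_similar (the token lists are written out by
# hand here rather than produced by jieba, so the inputs are an assumption):
# >>> edit_similar(["今天", "天气", "很", "好"], ["今天", "天气", "不错"])
# one substitution plus one deletion over max length 4 gives 1 - 2/4 = 0.5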


# Method 4: cosine similarity.
def cos_sim(str1, str2):  # str1 and str2 are token lists produced by segmentation
    co_str1 = Counter(str1)
    co_str2 = Counter(str2)
    p_str1 = []
    p_str2 = []
    for temp in set(str1 + str2):
        p_str1.append(co_str1[temp])
        p_str2.append(co_str2[temp])
    p_str1 = np.array(p_str1)
    p_str2 = np.array(p_str2)
    return p_str1.dot(p_str2) / (np.sqrt(p_str1.dot(p_str1)) * np.sqrt(p_str2.dot(p_str2)))
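

# Minimal usage sketch for cos_sim (same hand-written token lists as above,
# so the inputs are an assumption):
# >>> cos_sim(["今天", "天气", "很", "好"], ["今天", "天气", "不错"])
# dot product 2 over norms 2 * sqrt(3) gives about 0.577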


def misspelled_words_detect():
    """
    Typo detection.
    pip install pycorrector
    Installing kenlm:
    pip install https://github.com/kpu/kenlm/archive/master.zip
    The rule-based method loads the kenlm language model file from
    ~/.pycorrector/datasets/zh_giga.no_cna_cmn.prune01244.klm by default;
    if the file is not found, the program downloads it automatically.
    :return:
    """
    # Typo detection
    # import pycorrector
    # corrected_sent, detail = pycorrector.correct('让坐')
    # print(corrected_sent, detail)


def means_split(item):
    """
    Split a word's or phrase's dictionary gloss into individual senses.
    English POS labels: n. v. num. adj. adv. pron. prep. art. conj. int.
    :return:
    """
    item = re.sub(r"<.+?>|[((][A-Za-z\s]+[))]|\[[a-z.]+\]", "", item)
    item = re.sub(r"(\s*…\s*)+", "…", item)
    mean_list = re.split(r"\s+|[;,,;]|[a-z]+\.", item)
    mean_list = [mean.strip() for mean in mean_list if mean.strip()]
    mean_list = list(set(mean_list))
    return mean_list
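

# Minimal usage sketch for means_split (the gloss string is a made-up example):
# >>> means_split("n. 苹果;水果 v. 削苹果")
# POS labels and separators are stripped, leaving senses such as
# ['苹果', '水果', '削苹果'] (set order is not guaranteed)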


def words_classify_from_textbook(path, source, word2mean):
    """
    Collect the words from secondary-school textbooks.
    :param path: path to the Excel workbook
    :param source: textbook edition label
    :param word2mean: dict mapping word -> {source + sheet: meaning}
    :return:
    """
    sheets = ["必修一", "必修二", "必修三", "选择性必修一", "选择性必修二", "选择性必修三", "选择性必修四"]
    if "旧" in source:
        sheets = ["必修一", "必修二", "必修三", "必修四", "必修五", "选修六", "选修七", "选修八"]
    for sheet in sheets:
        df1 = pd.read_excel(path, sheet_name=sheet)
        print(source + sheet)
        # word2mean = dict(zip(df1["单词"], df1["词性词义"]))
        for i, row in df1.iterrows():
            # print(row["单词"])
            word = re.sub(r"^\*", "", row["单词"].strip())
            mean = row["词性词义"].replace("\n", "<br>")
            if word in word2mean:
                word2mean[word].update({source + sheet: mean})
            else:
                word2mean[word] = {source + sheet: mean}
            # print('"{}": {{"mean": "{}", "source": "{}{}"}},'.format(
            #     word, row["词性词义"].replace("\n", ";<br>"), source, sheet))
    return word2mean
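

# Minimal usage sketch (the workbook filename is hypothetical; the author's
# real paths appear commented out under __main__ below):
# >>> word2mean_dict = words_classify_from_textbook("新人教单词全册.xlsx", "新人教", {})
# Each entry ends up shaped like {"apple": {"新人教必修一": "n. 苹果"}}.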


def word_screen(one_d):
    """
    Filter out words whose entry contains ( ) , / or = .
    :param one_d: {"word": {"source": "meaning"}}
    :return:
    """
    for k, v in one_d.items():
        # if re.search("[(())\[\]/=]|缩写", k) or \
        #         re.search("[(())\[\]/=]|缩写", ";".join(list(v.values()))):
        #     print("{}: {},".format(k, v))
        if re.search("=|缩写", ";".join(list(v.values()))):  # "缩写" means "abbreviation"
            print("'{}': {},".format(k, v))
        # a = [
        #     'good/bad-tempered',
        #     'in good/bad shape',
        #     'in the short/long term',
        #     'switch off/on',
        #     "turn one's back (on sb/sth)",
        #     'the International Olympic Committee/IOC',
        #     'no longer/not…any longer', 'in good/poor condition', 'in addition (to sb/sth)', "easy-going/i:zi:'gəuiŋ/",
        #     'break away (from sb/sth)',
        #     'be / feel in the mood (for sth. / to do sth.)',
        #     '(be) true of / for',
        #     'a great/good many',
        #     'absorbed in sth/sb',
        #     'commit oneself to (sth/doing sth/do sth)',
        #     'analyse [NAmE -ze]',
        #     '(be) bound to (do) …',
        #     'be bound to (do) …',
        #     '(be) bound to …',
        #     'set (a play, novel, etc.) in',
        #     'pi(π)',
        #     'pin (on)',
        #     '2D(2-dimensional)',
        #     'AI (artificial intelligence)',
        #     'AR(Augmented Reality)',
        #     'MR(Mixed Reality)',
        #     'PhD (Doctor of Philosophy)',
        #     'VR(Virtual Reality)',
        #     '(at) first hand',
        #     '(be) allergic to',
        #     'am(ante meridiem)',
        #     "when the cat's away (the mice will play)",
        # ]
        # if re.search("[(())]", k) and k not in a:
        #     k = re.sub("\s*/\s*", "/", k).replace("(", "(").replace(")", ")")
        #     # k1 = re.sub("/[a-z]+$|/[a-z]+(?=\s)", "", k)
        #     # k2 = re.sub("(?<=\s)[a-z]+/|^[a-z]+/", "", k)
        #     k4 = ""
        #     if "(be)" in k:
        #         k3 = re.sub("\(be\)", " be ", k).replace(" ", " ").strip()
        #     else:
        #         k3 = re.sub("\(.*?\)", " ", k).replace(" ", " ").strip()
        #         k4 = re.sub(r"\((.*?)\)", r" \1 ", k).replace(" ", " ").strip()
        #     print("'{}': {},".format(k, v))
        #     print("'{}': {},".format(k3, v))
        #     if k4:
        #         print("'{}': {},".format(k4, v))
        #     # print("---------------------------")


def phrase_classify(en_word):
    """
    Classify a word or phrase by its part of speech.
    :return:
    """
    text = word_tokenize(en_word)
    ptag = pos_tag(text)
    # print("phrase_classify:::::", ptag)
    if ptag:
        if len(en_word.split(" ")) > 1:
            if ptag[0][1] in ["VB", "V"]:
                return "v-phrase"
            if ptag[0][1] == "IN":
                return "prep-phrase"
            if len(ptag) == 2 and ptag[1][1] == "NN" and ptag[0][1] in ["NN", "ADJ", "JJ", "RB"]:
                return "n-phrase"
            if "NN+IN+NN" in "+".join([i[1] for i in ptag]):
                return "n-phrase"
        else:
            return ptag[0][-1]
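

# Minimal usage sketch (the tags come from NLTK's default tagger, so the exact
# outputs below are assumptions; multi-word phrases matching no rule return None):
# >>> phrase_classify("in addition")   # "in" tagged IN -> "prep-phrase"
# >>> phrase_classify("break away")    # "break" typically tagged VB -> "v-phrase"
# >>> phrase_classify("apple")         # single word -> its POS tag, e.g. "NN"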


# (Rewritten) MyThread.py thread class, so that it can return a value.
# class MyThread(threading.Thread):
#     def __init__(self, func, args=(), kwargs=None):
#         super(MyThread, self).__init__()
#         self.func = func
#         self.args = args
#         self.kwargs = kwargs
#
#     # The overridden run() no longer executes the original run().
#     # Note: even with a return it will not return a value,
#     # e.g. return self.func(*self.args) is discarded.
#     def run(self):
#         if self.kwargs:
#             self.result = self.func(self.kwargs["arg1"], self.kwargs["arg2"])
#         else:
#             self.result = self.func(*self.args)
#
#     def get_result(self):
#         # return self.result
#         # Must wait for the thread to finish; fetching result before the
#         # thread completes yields nothing.
#         threading.Thread.join(self)
#         try:
#             return self.result
#         except Exception:
#             return None


# POS tagging with THULAC: http://thulac.thunlp.org/demo
if __name__ == '__main__':
    from pprint import pprint

    # Examples
    # str1 = "现在什么时候了"
    # str2 = "什么时候了现在"
    str1 = "A highly recommended book"
    str2 = "a highly recommendable book"
    str11 = jieba.lcut(str1)
    str22 = jieba.lcut(str2)
    # print('str1=' + str1)  # after jieba segmentation
    # print(str11)  # after jieba segmentation
    # diff_result = difflib.SequenceMatcher(None, str1, str2).ratio()
    # print('Method 1, difflib score: ' + str(diff_result))
    # print('Method 2, fuzzywuzzy score: ' + str(fuzz.ratio(str1, str2) / 100))
    st1 = time.time()
    print('Method 3, edit-distance score: ' + str(edit_similar(str11, str22)), "tt1:", time.time() - st1)
    st2 = time.time()
    # print('Method 4, cosine-similarity score: ' + str(cos_sim(str1, str2)), "tt2:", time.time() - st2)
    # ---------------------------------------------
    # edition = ["新人教", "新外研", "新牛津", "新北师大", "旧人教", "旧外研", "旧牛津", "旧北师大"]
    # # edition = ["旧北师大"]
    # word2mean_dict = {}
    # for edit in edition:
    #     path = r"G:\zwj\WL\en2cn\files\教材义\教材单元汇总表格\{}单词全册.xlsx".format(edit)
    #     res = words_classify_from_textbook(path, edit, word2mean_dict)
    #     word2mean_dict.update(res)
    # pprint(word2mean_dict)
    # ---------------------------------------------------------
    # path = r"G:\zwj\WL\en2cn\files\教材义\初中考纲1600词【词汇+词性词义】.xlsx"
    # df1 = pd.read_excel(path)
    # word2mean = {}
    # for i, row in df1.iterrows():
    #     # print(row["单词"])
    #     word = re.sub(r"^\*", "", row["词汇"].strip())
    #     mean = row["词性词义"].replace("\n", "<br>").replace("\ue009", " ")
    #     if word in word2mean:
    #         print(word)
    #     else:
    #         word2mean[word] = mean
    # pprint(word2mean)
    # ----------------------------------------------------
    from Words.word_dict_from_textbook import word2mean_high
    # from Words.Phrase_dict import phrases_dict_bing
    # # word_screen(word2mean_high)
    # for k, v in phrases_dict_bing.items():
    #     zh_full_wash(v, source="bing")
    # ----------------------------------------------
    # text = word_tokenize('categorise')
    # ptag = pos_tag(text)
    # print(ptag)
    # t1 = time.time()
    # phrase_classify('evaporate')
    # print(time.time() - t1)
    # from Words.Phrase_dict import phrases_dict_tk
    # for k, v in phrases_dict_tk.items():
    #     if " " in k:
    #         print(k)
    # ---------------- Aggregate wrong meanings from the grading data ----------------
    from Words.Phrase_dict import errmean_en_dict
    # path = r"G:\zwj\WL\en2cn\files\复评文件\英译汉自动批改9-29.xlsx"
    # df = pd.read_excel(path, sheet_name="Sheet1")
    # df = df.dropna(subset="错误意思", axis=0)
    # # print(df["错误意思"])
    # # errmean_word = {}
    # for i, row in df.iterrows():
    #     # print(row["单词"], row["错误意思"])
    #     enword = row["单词"].strip()
    #     errmean = re.sub(r"\.\s*$", "", row["错误意思"]).strip()
    #     if enword not in errmean_en_dict:
    #         errmean_en_dict[enword] = errmean
    #     else:
    #         for j in re.split("[;;]", errmean):
    #             if j not in errmean_en_dict[enword] and (";" + j) not in errmean_en_dict[enword]:
    #                 errmean_en_dict[enword] += ";" + j
    # pprint(errmean_en_dict)