# -*- coding: utf-8 -*-
import re
import jieba  # jieba Chinese word segmentation
import time
# import difflib  # Method 1: similarity from the Python standard library, usable as-is
# from fuzzywuzzy import fuzz  # Method 2: similarity from the third-party fuzzywuzzy package, usable as-is
import numpy as np
from collections import Counter
import pandas as pd
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# import multiprocessing
# import threading


# Similarity between two Chinese sentences.
# Method 3: edit distance, a.k.a. Levenshtein distance.
def edit_similar(str1, str2):  # str1, str2 are token lists after segmentation
    len_str1 = len(str1)
    len_str2 = len(str2)
    taglist = np.zeros((len_str1 + 1, len_str2 + 1))
    for a in range(len_str1 + 1):  # fixed: the init must cover the full first column/row
        taglist[a][0] = a
    for a in range(len_str2 + 1):
        taglist[0][a] = a
    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            temp = 0 if str1[i - 1] == str2[j - 1] else 1
            taglist[i][j] = min(taglist[i - 1][j - 1] + temp,
                                taglist[i][j - 1] + 1,
                                taglist[i - 1][j] + 1)
    return 1 - taglist[len_str1][len_str2] / max(len_str1, len_str2)


# Method 4: cosine similarity over token counts.
def cos_sim(str1, str2):  # str1, str2 are token lists after segmentation
    co_str1 = Counter(str1)
    co_str2 = Counter(str2)
    p_str1 = []
    p_str2 = []
    for temp in set(str1 + str2):
        p_str1.append(co_str1[temp])
        p_str2.append(co_str2[temp])
    p_str1 = np.array(p_str1)
    p_str2 = np.array(p_str2)
    return p_str1.dot(p_str2) / (np.sqrt(p_str1.dot(p_str1)) * np.sqrt(p_str2.dot(p_str2)))
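
# Hedged sanity check for the two measures above (an illustrative sketch, not part
# of the original pipeline; the token lists are made-up examples). edit_similar is
# order-sensitive, while cos_sim only compares bags of tokens, so swapping word
# order keeps cosine at 1.0 but lowers the edit-distance score.
def _similarity_sanity_check():
    a = ["今天", "天气", "很", "好"]
    b = ["天气", "今天", "很", "好"]
    assert edit_similar(a, a) == 1.0        # identical lists -> 1.0
    assert abs(cos_sim(a, b) - 1.0) < 1e-9  # same bag of words -> cosine 1.0
    assert edit_similar(a, b) == 0.5        # two substitutions out of four tokens
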
") if word in word2mean: word2mean[word].update({source+sheet: mean}) else: word2mean[word] = {source + sheet: mean} # print('"{}": {{"mean": "{}", "source": "{}{}"}},'.format( # word, row["词性词义"].replace("\n", ";
"), source, sheet)) return word2mean def word_screen(one_d): """ 筛选含(),/ = 的单词 :return: one_d: {"":{"":""}} """ for k, v in one_d.items(): # if re.search("[(())\[\]/=]|缩写", k) or \ # re.search("[(())\[\]/=]|缩写", ";".join(list(v.values()))): # print("{}: {},".format(k, v)) if re.search("=|缩写", ";".join(list(v.values()))): print("'{}': {},".format(k, v)) # a = [ # 'good/bad-tempered', # 'in good/bad shape', # 'in the short/long term', # 'switch off/on', # "turn one's back (on sb/sth)", # 'the International Olympic Committee/IOC', # 'no longer/not…any longer', 'in good/poor condition', 'in addition (to sb/sth)', "easy-going/i:zi:'gəuiŋ/", # 'break away (from sb/sth)', # 'be / feel in the mood (for sth. / to do sth.)', # '(be) true of / for', # 'a great/good many', # 'absorbed in sth/sb', # 'commit oneself to (sth/doing sth/do sth)', # 'analyse [NAmE -ze]', # '(be) bound to (do) …', # 'be bound to (do) …', # '(be) bound to …', # 'set (a play, novel, etc.) in', # 'pi(π)', # 'pin (on)', # '2D(2-dimensional)', # 'AI (artificial intelligence)', # 'AR(Augmented Reality)', # 'MR(Mixed Reality)', # 'PhD (Doctor of Philosophy)', # 'VR(Virtual Reality)', # '(at) first hand', # '(be) allergic to', # 'am(ante meridiem)', # "when the cat's away (the mice will play)", # # ] # if re.search("[(())]", k) and k not in a: # k = re.sub("\s*/\s*", "/", k).replace("(", "(").replace(")", ")") # # k1 = re.sub("/[a-z]+$|/[a-z]+(?=\s)", "", k) # # k2 = re.sub("(?<=\s)[a-z]+/|^[a-z]+/", "", k) # k4 = "" # if "(be)" in k: # k3 = re.sub("\(be\)", " be ", k).replace(" ", " ").strip() # else: # k3 = re.sub("\(.*?\)", " ", k).replace(" ", " ").strip() # k4 = re.sub(r"\((.*?)\)", r" \1 ", k).replace(" ", " ").strip() # print("'{}': {},".format(k, v)) # print("'{}': {},".format(k3, v)) # if k4: # print("'{}': {},".format(k4, v)) # # print("---------------------------") def phrase_classify(en_word): """ 短语分类 :return: """ text = word_tokenize(en_word) ptag = pos_tag(text) # print("phrase_classify:::::", ptag) if ptag: if len(en_word.split(" ")) > 1: if ptag[0][1] in ["VB", "V"]: return "v-phrase" if ptag[0][1] == "IN": return "prep-phrase" if len(ptag) == 2 and ptag[1][1] == "NN" and ptag[0][1] in ["NN", "ADJ", "JJ", "RB"]: return "n-phrase" if "NN+IN+NN" in "+".join([i[1] for i in ptag]): return "n-phrase" else: return ptag[0][-1] # (重写)MyThread.py线程类,使其能够返回值 # class MyThread(threading.Thread): # def __init__(self, func, args=(), kwargs=None): # super(MyThread, self).__init__() # self.func = func # self.args = args # self.kwargs = kwargs # # # 重写后的run()方法不再执行以前的run()方法了 # # 注意:即使加了return也不会返回值,如return self.func(*self.args) # def run(self): # if self.kwargs: # self.result = self.func(self.kwargs["arg1"], self.kwargs["arg2"]) # else: # self.result = self.func(*self.args) # # def get_result(self): # # return self.result # # 必须等待线程执行完毕,如果线程还未执行完毕就去获取result是没有结果的 # threading.Thread.join(self) # try: # return self.result # except Exception: # return None # 词性标注THULAC:http://thulac.thunlp.org/demo if __name__ == '__main__': from pprint import pprint # 举例说明 # str1 = "现在什么时候了" # str2 = "什么时候了现在" str1 = "A highly recommended book" str2 = "a highly recommendable book" str11 = jieba.lcut(str1) str22 = jieba.lcut(str2) # print('str1=' + str1) # jieba分词后 # print(str11) # jieba分词后 # diff_result = difflib.SequenceMatcher(None, str1, str2).ratio() # print('方法一:Python标准库difflib的计算分值:' + str(diff_result)) # print('方法二:Python标准库fuzz的计算分值:' + str(fuzz.ratio(str1, str2) / 100)) st1 = time.time() print('方法三:编辑距离的计算分值:' + str(edit_similar(str11, 
str22)),"tt1:",time.time()-st1) st2 = time.time() # print('方法四:余弦相似度的计算分值:' + str(cos_sim(str1, str2)), "tt2:", time.time()-st2) # --------------------------------------------- # edition = ["新人教", "新外研", "新牛津", "新北师大", "旧人教", "旧外研", "旧牛津", "旧北师大"] # # edition = ["旧北师大"] # word2mean_dict = {} # for edit in edition: # path = r"G:\zwj\WL\en2cn\files\教材义\教材单元汇总表格\{}单词全册.xlsx".format(edit) # res = words_classify_from_textbook(path, edit, word2mean_dict) # word2mean_dict.update(res) # pprint(word2mean_dict) # --------------------------------------------------------- # path = r"G:\zwj\WL\en2cn\files\教材义\初中考纲1600词【词汇+词性词义】.xlsx" # df1 = pd.read_excel(path) # word2mean = {} # for i, row in df1.iterrows(): # # print(row["单词"]) # word = re.sub(r"^\*", "", row["词汇"].strip()) # mean = row["词性词义"].replace("\n", "
").replace("\ue009", " ") # if word in word2mean: # print(word) # else: # word2mean[word] = mean # pprint(word2mean) # ---------------------------------------------------- from Words.word_dict_from_textbook import word2mean_high # from Words.Phrase_dict import phrases_dict_bing # # word_screen(word2mean_high) # for k, v in phrases_dict_bing.items(): # zh_full_wash(v, source="bing") # ---------------------------------------------- # text = word_tokenize('categorise') # ptag = pos_tag(text) # print(ptag) # t1 = time.time() # phrase_classify('evaporate') # print(time.time() - t1) # from Words.Phrase_dict import phrases_dict_tk # for k, v in phrases_dict_tk.items(): # if " " in k: # print(k) # ----------------根据批改数据将错误意思汇总---------------- from Words.Phrase_dict import errmean_en_dict # path = r"G:\zwj\WL\en2cn\files\复评文件\英译汉自动批改9-29.xlsx" # df = pd.read_excel(path, sheet_name="Sheet1") # df = df.dropna(subset="错误意思", axis=0) # # print(df["错误意思"]) # # errmean_word = {} # for i, row in df.iterrows(): # # print(row["单词"], row["错误意思"]) # enword = row["单词"].strip() # errmean = re.sub("\.\s*$", "", row["错误意思"]).strip() # if enword not in errmean_en_dict: # errmean_en_dict[enword] = errmean # else: # for j in re.split("[;;]", errmean): # if j not in errmean_en_dict[enword] and (";"+j) not in errmean_en_dict[enword]: # errmean_en_dict[enword] += ";" + j # pprint(errmean_en_dict)