# -*- coding: utf-8 -*-
import re
import jieba  # jieba Chinese word segmentation
import time
# import difflib  # Method 1: similarity from the Python standard library, usable as-is
# from fuzzywuzzy import fuzz  # Method 2: similarity from the third-party fuzzywuzzy package, usable as-is
import numpy as np
from collections import Counter
import pandas as pd
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# import multiprocessing
# import threading
# Compute the similarity of two Chinese sentences
# Method 3: edit distance, also known as Levenshtein distance
def edit_similar(str1, str2):  # str1, str2 are token lists produced by word segmentation
    len_str1 = len(str1)
    len_str2 = len(str2)
    if max(len_str1, len_str2) == 0:  # both empty: avoid division by zero, treat as identical
        return 1.0
    taglist = np.zeros((len_str1 + 1, len_str2 + 1))
    for a in range(len_str1 + 1):  # include the last row/column in the initialisation
        taglist[a][0] = a
    for a in range(len_str2 + 1):
        taglist[0][a] = a
    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            if str1[i - 1] == str2[j - 1]:
                temp = 0
            else:
                temp = 1
            taglist[i][j] = min(taglist[i - 1][j - 1] + temp, taglist[i][j - 1] + 1, taglist[i - 1][j] + 1)
    return 1 - taglist[len_str1][len_str2] / max(len_str1, len_str2)
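# Illustrative usage sketch (not part of the original pipeline): the token lists
# would normally come from jieba.lcut, as in the __main__ block below.
def _edit_similar_demo():
    tokens_a = jieba.lcut("今天天气很好")
    tokens_b = jieba.lcut("今天天气不错")
    return edit_similar(tokens_a, tokens_b)  # 1.0 for identical token lists, towards 0.0 as they diverge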
# Method 4: cosine similarity
def cos_sim(str1, str2):  # str1, str2 are token lists produced by word segmentation
    co_str1 = Counter(str1)
    co_str2 = Counter(str2)
    p_str1 = []
    p_str2 = []
    for temp in set(str1 + str2):
        p_str1.append(co_str1[temp])
        p_str2.append(co_str2[temp])
    p_str1 = np.array(p_str1)
    p_str2 = np.array(p_str2)
    return p_str1.dot(p_str2) / (np.sqrt(p_str1.dot(p_str1)) * np.sqrt(p_str2.dot(p_str2)))
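# Illustrative usage sketch (not part of the original pipeline): cosine similarity works on
# bag-of-token counts, so reordering the same tokens still scores 1.0, unlike edit_similar.
def _cos_sim_demo():
    tokens_a = jieba.lcut("现在什么时候了")
    tokens_b = jieba.lcut("什么时候了现在")
    return cos_sim(tokens_a, tokens_b)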
def misspelled_words_detect():
    """
    Misspelled-character (typo) detection
    pip install pycorrector
    Installing kenlm:
    pip install https://github.com/kpu/kenlm/archive/master.zip
    The rule-based method loads the kenlm language model file from
    ~/.pycorrector/datasets/zh_giga.no_cna_cmn.prune01244.klm by default;
    if the file is missing, it is downloaded automatically.
    :return:
    """
    # Typo detection
    # import pycorrector
    # corrected_sent, detail = pycorrector.correct('让坐')
    # print(corrected_sent, detail)
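# Minimal sketch, assuming the pycorrector API shown in the docstring above; the import
# is kept local so the module still loads when pycorrector is not installed.
def _pycorrector_demo(text="让坐"):
    import pycorrector
    corrected_sent, detail = pycorrector.correct(text)
    return corrected_sent, detail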
def means_split(item):
    """
    Split a word's / phrase's dictionary gloss into individual senses
    English POS labels: n. v. num. adj. adv. pron. prep. art. conj. int.
    :return:
    """
    item = re.sub(r"<.+?>|[((][A-Za-z\s]+[))]|\[[a-z.]+\]", "", item)  # drop tags, bracketed notes and phonetic hints
    item = re.sub(r"(\s*…\s*)+", "…", item)  # collapse repeated ellipses
    mean_list = re.split(r"\s+|[;,,;]|[a-z]+\.", item)  # split on whitespace, separators and POS labels
    mean_list = [mean.strip() for mean in mean_list if mean.strip()]
    mean_list = list(set(mean_list))  # deduplicate
    return mean_list
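# Illustrative usage sketch: the gloss string is a made-up example in the format the
# regexes above expect, i.e. POS labels followed by Chinese senses.
def _means_split_demo():
    return means_split("n. 申请;请求 v. 申请;应用")  # an unordered list like ["申请", "请求", "应用"]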
def words_classify_from_textbook(path, source, word2mean):
    """
    Collect the words from the middle-school textbooks
    :param path: path of the Excel workbook for one textbook edition
    :param source: edition label, e.g. "新人教"
    :param word2mean: accumulator dict, {word: {source + sheet: meaning}}
    :return:
    """
    sheets = ["必修一", "必修二", "必修三", "选择性必修一", "选择性必修二", "选择性必修三", "选择性必修四"]
    if "旧" in source:
        sheets = ["必修一", "必修二", "必修三", "必修四", "必修五", "选修六", "选修七", "选修八"]
    for sheet in sheets:
        df1 = pd.read_excel(path, sheet_name=sheet)
        print(source + sheet)
        # word2mean = dict(zip(df1["单词"], df1["词性词义"]))
        for i, row in df1.iterrows():
            # print(row["单词"])
            word = re.sub(r"^\*", "", row["单词"].strip())  # drop the leading "*" marker
            mean = row["词性词义"].replace("\n", "<br>")  # keep line breaks as <br>
            if word in word2mean:
                word2mean[word].update({source + sheet: mean})
            else:
                word2mean[word] = {source + sheet: mean}
            # print('"{}": {{"mean": "{}", "source": "{}{}"}},'.format(
            #     word, row["词性词义"].replace("\n", ";<br>"), source, sheet))
    return word2mean
def word_screen(one_d):
    """
    Screen out entries whose headword or meaning contains (), /, = or the abbreviation marker "缩写"
    :param one_d: {"word": {"source": "meaning"}}
    :return:
    """
    for k, v in one_d.items():
        # if re.search("[(())\[\]/=]|缩写", k) or \
        #         re.search("[(())\[\]/=]|缩写", ";".join(list(v.values()))):
        #     print("{}: {},".format(k, v))
        if re.search("=|缩写", ";".join(list(v.values()))):
            print("'{}': {},".format(k, v))
        # a = [
        #     'good/bad-tempered',
        #     'in good/bad shape',
        #     'in the short/long term',
        #     'switch off/on',
        #     "turn one's back (on sb/sth)",
        #     'the International Olympic Committee/IOC',
        #     'no longer/not…any longer', 'in good/poor condition', 'in addition (to sb/sth)', "easy-going/i:zi:'gəuiŋ/",
        #     'break away (from sb/sth)',
        #     'be / feel in the mood (for sth. / to do sth.)',
        #     '(be) true of / for',
        #     'a great/good many',
        #     'absorbed in sth/sb',
        #     'commit oneself to (sth/doing sth/do sth)',
        #     'analyse [NAmE -ze]',
        #     '(be) bound to (do) …',
        #     'be bound to (do) …',
        #     '(be) bound to …',
        #     'set (a play, novel, etc.) in',
        #     'pi(π)',
        #     'pin (on)',
        #     '2D(2-dimensional)',
        #     'AI (artificial intelligence)',
        #     'AR(Augmented Reality)',
        #     'MR(Mixed Reality)',
        #     'PhD (Doctor of Philosophy)',
        #     'VR(Virtual Reality)',
        #     '(at) first hand',
        #     '(be) allergic to',
        #     'am(ante meridiem)',
        #     "when the cat's away (the mice will play)",
        #
        # ]
        # if re.search("[(())]", k) and k not in a:
        #     k = re.sub("\s*/\s*", "/", k).replace("(", "(").replace(")", ")")
        #     # k1 = re.sub("/[a-z]+$|/[a-z]+(?=\s)", "", k)
        #     # k2 = re.sub("(?<=\s)[a-z]+/|^[a-z]+/", "", k)
        #     k4 = ""
        #     if "(be)" in k:
        #         k3 = re.sub("\(be\)", " be ", k).replace("  ", " ").strip()
        #     else:
        #         k3 = re.sub("\(.*?\)", " ", k).replace("  ", " ").strip()
        #     k4 = re.sub(r"\((.*?)\)", r" \1 ", k).replace("  ", " ").strip()
        #     print("'{}': {},".format(k, v))
        #     print("'{}': {},".format(k3, v))
        #     if k4:
        #         print("'{}': {},".format(k4, v))
        #     # print("---------------------------")
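# Illustrative usage sketch (not part of the original pipeline): word_screen only prints the
# entries it matches, so the demo just feeds it a tiny made-up dict in the expected shape.
def _word_screen_demo():
    sample = {"IOC": {"新人教必修一": "abbr. 国际奥委会(缩写)"},
              "apple": {"新人教必修一": "n. 苹果"}}
    word_screen(sample)  # prints only the "IOC" entry, whose meaning contains "缩写"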
def phrase_classify(en_word):
    """
    Classify a word / phrase by part of speech
    :return:
    """
    text = word_tokenize(en_word)
    ptag = pos_tag(text)
    # print("phrase_classify:::::", ptag)
    if ptag:
        if len(en_word.split(" ")) > 1:
            # multi-word input: decide the phrase type from the tag pattern
            if ptag[0][1] in ["VB", "V"]:
                return "v-phrase"
            if ptag[0][1] == "IN":
                return "prep-phrase"
            if len(ptag) == 2 and ptag[1][1] == "NN" and ptag[0][1] in ["NN", "ADJ", "JJ", "RB"]:
                return "n-phrase"
            if "NN+IN+NN" in "+".join([i[1] for i in ptag]):
                return "n-phrase"
        else:
            # single word: return its POS tag directly
            return ptag[0][-1]
    # falls through and returns None when no rule matches
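# Illustrative usage sketch: the labels depend on NLTK's pos_tag model, so the expected
# outputs in the comment are indicative rather than guaranteed.
def _phrase_classify_demo():
    return phrase_classify("in addition"), phrase_classify("book")
    # -> typically ("prep-phrase", "NN"); multi-word inputs that match no rule return None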
# (Rewritten) MyThread.py thread class so that it can return a value
# class MyThread(threading.Thread):
#     def __init__(self, func, args=(), kwargs=None):
#         super(MyThread, self).__init__()
#         self.func = func
#         self.args = args
#         self.kwargs = kwargs
#
#     # The overridden run() no longer executes the original run()
#     # Note: even adding return would not hand back a value, e.g. return self.func(*self.args)
#     def run(self):
#         if self.kwargs:
#             self.result = self.func(self.kwargs["arg1"], self.kwargs["arg2"])
#         else:
#             self.result = self.func(*self.args)
#
#     def get_result(self):
#         # return self.result
#         # Must wait for the thread to finish; reading result before the thread is done yields nothing
#         threading.Thread.join(self)
#         try:
#             return self.result
#         except Exception:
#             return None
# POS tagging with THULAC: http://thulac.thunlp.org/demo
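# Minimal sketch: the standard-library concurrent.futures achieves the same goal as the
# commented-out MyThread above (run a function in a worker thread and get its return value
# back) without subclassing Thread.
def _threaded_result_demo(func, *args):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(func, *args)
        return future.result()  # blocks until the worker thread finishes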
if __name__ == '__main__':
    from pprint import pprint
    # Worked examples
    # str1 = "现在什么时候了"
    # str2 = "什么时候了现在"
    str1 = "A highly recommended book"
    str2 = "a highly recommendable book"
    str11 = jieba.lcut(str1)
    str22 = jieba.lcut(str2)
    # print('str1=' + str1)
    # print(str11)  # tokens after jieba segmentation
    # diff_result = difflib.SequenceMatcher(None, str1, str2).ratio()
    # print('方法一:Python标准库difflib的计算分值:' + str(diff_result))
    # print('方法二:Python标准库fuzz的计算分值:' + str(fuzz.ratio(str1, str2) / 100))
    st1 = time.time()
    print('方法三:编辑距离的计算分值:' + str(edit_similar(str11, str22)), "tt1:", time.time() - st1)
    st2 = time.time()
    # print('方法四:余弦相似度的计算分值:' + str(cos_sim(str11, str22)), "tt2:", time.time() - st2)
    # ---------------------------------------------
    # edition = ["新人教", "新外研", "新牛津", "新北师大", "旧人教", "旧外研", "旧牛津", "旧北师大"]
    # # edition = ["旧北师大"]
    # word2mean_dict = {}
    # for edit in edition:
    #     path = r"G:\zwj\WL\en2cn\files\教材义\教材单元汇总表格\{}单词全册.xlsx".format(edit)
    #     res = words_classify_from_textbook(path, edit, word2mean_dict)
    #     word2mean_dict.update(res)
    # pprint(word2mean_dict)
    # ---------------------------------------------------------
    # path = r"G:\zwj\WL\en2cn\files\教材义\初中考纲1600词【词汇+词性词义】.xlsx"
    # df1 = pd.read_excel(path)
    # word2mean = {}
    # for i, row in df1.iterrows():
    #     # print(row["单词"])
    #     word = re.sub(r"^\*", "", row["词汇"].strip())
    #     mean = row["词性词义"].replace("\n", "<br>").replace("\ue009", " ")
    #     if word in word2mean:
    #         print(word)
    #     else:
    #         word2mean[word] = mean
    # pprint(word2mean)
    # ----------------------------------------------------
    from Words.word_dict_from_textbook import word2mean_high
    # from Words.Phrase_dict import phrases_dict_bing
    # # word_screen(word2mean_high)
    # for k, v in phrases_dict_bing.items():
    #     zh_full_wash(v, source="bing")
    # ----------------------------------------------
    # text = word_tokenize('categorise')
    # ptag = pos_tag(text)
    # print(ptag)
    # t1 = time.time()
    # phrase_classify('evaporate')
    # print(time.time() - t1)
    # from Words.Phrase_dict import phrases_dict_tk
    # for k, v in phrases_dict_tk.items():
    #     if " " in k:
    #         print(k)
    # ---------------- Collect the wrong meanings from the grading data ----------------
    from Words.Phrase_dict import errmean_en_dict
    # path = r"G:\zwj\WL\en2cn\files\复评文件\英译汉自动批改9-29.xlsx"
    # df = pd.read_excel(path, sheet_name="Sheet1")
    # df = df.dropna(subset=["错误意思"], axis=0)
    # # print(df["错误意思"])
    # # errmean_word = {}
    # for i, row in df.iterrows():
    #     # print(row["单词"], row["错误意思"])
    #     enword = row["单词"].strip()
    #     errmean = re.sub(r"\.\s*$", "", row["错误意思"]).strip()
    #     if enword not in errmean_en_dict:
    #         errmean_en_dict[enword] = errmean
    #     else:
    #         for j in re.split("[;;]", errmean):
    #             if j not in errmean_en_dict[enword] and (";" + j) not in errmean_en_dict[enword]:
    #                 errmean_en_dict[enword] += ";" + j
    # pprint(errmean_en_dict)