# -*- coding: utf-8 -*-
import re
import jieba  # jieba Chinese word segmentation
import time
# import difflib  # Method 1: similarity via Python's built-in difflib, ready to use
# from fuzzywuzzy import fuzz  # Method 2: similarity via the third-party fuzzywuzzy package, ready to use
import numpy as np
from collections import Counter
import pandas as pd
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# import multiprocessing
# import threading
# Compute the similarity between two Chinese sentences
# Method 3: edit distance, a.k.a. Levenshtein distance
def edit_similar(str1, str2):  # str1, str2 are token lists produced by word segmentation
    len_str1 = len(str1)
    len_str2 = len(str2)
    if max(len_str1, len_str2) == 0:  # both empty: define similarity as 1.0 to avoid division by zero
        return 1.0
    # taglist[i][j] holds the edit distance between str1[:i] and str2[:j]
    taglist = np.zeros((len_str1 + 1, len_str2 + 1))
    for a in range(len_str1 + 1):  # include index len_str1: base cost of deleting all tokens
        taglist[a][0] = a
    for a in range(len_str2 + 1):  # include index len_str2: base cost of inserting all tokens
        taglist[0][a] = a
    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            temp = 0 if str1[i - 1] == str2[j - 1] else 1
            taglist[i][j] = min(taglist[i - 1][j - 1] + temp, taglist[i][j - 1] + 1, taglist[i - 1][j] + 1)
    # Normalize the distance into a similarity score in [0, 1]
    return 1 - taglist[len_str1][len_str2] / max(len_str1, len_str2)
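# A minimal usage sketch for edit_similar; the inputs are jieba token lists,
# and the two sentences are the commented examples from __main__ below.
def _demo_edit_similar():
    tokens1 = jieba.lcut("现在什么时候了")
    tokens2 = jieba.lcut("什么时候了现在")
    # Same tokens in a different order still incur edit-distance cost, so < 1.0
    print(edit_similar(tokens1, tokens2))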
# Method 4: cosine similarity
def cos_sim(str1, str2):  # str1, str2 are token lists produced by word segmentation
    co_str1 = Counter(str1)
    co_str2 = Counter(str2)
    p_str1 = []
    p_str2 = []
    # Build aligned term-frequency vectors over the union vocabulary
    for temp in set(str1 + str2):
        p_str1.append(co_str1[temp])
        p_str2.append(co_str2[temp])
    p_str1 = np.array(p_str1)
    p_str2 = np.array(p_str2)
    return p_str1.dot(p_str2) / (np.sqrt(p_str1.dot(p_str1)) * np.sqrt(p_str2.dot(p_str2)))
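# A minimal usage sketch for cos_sim. It compares bag-of-words counts, so word
# order is ignored: the reordered sentence pair below scores exactly 1.0.
def _demo_cos_sim():
    tokens1 = jieba.lcut("现在什么时候了")
    tokens2 = jieba.lcut("什么时候了现在")
    print(cos_sim(tokens1, tokens2))  # 1.0: identical token counts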
def misspelled_words_detect():
    """
    Misspelled-word (typo) detection.
    pip install pycorrector
    Installing kenlm:
    pip install https://github.com/kpu/kenlm/archive/master.zip
    The rule-based method loads the kenlm language model from
    ~/.pycorrector/datasets/zh_giga.no_cna_cmn.prune01244.klm by default;
    if the file is not found, it is downloaded automatically.
    :return:
    """
    # Typo detection
    # import pycorrector
    # corrected_sent, detail = pycorrector.correct('让坐')
    # print(corrected_sent, detail)
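# A hedged, self-contained sketch of the pycorrector call described in the
# docstring above; guarded so the module still imports if pycorrector is absent.
# The (corrected_sent, detail) return shape follows the commented example, i.e.
# the older pycorrector API assumed there.
def _demo_misspelled_words_detect():
    try:
        import pycorrector
    except ImportError:
        print("pycorrector is not installed; see misspelled_words_detect() for setup")
        return
    corrected_sent, detail = pycorrector.correct('让坐')  # same example as above
    print(corrected_sent, detail)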
def means_split(item):
    """
    Split a word's or phrase's gloss string into individual senses.
    English POS labels: n. v. num. adj. adv. pron. prep. art. conj. int.
    :return:
    """
    # Strip HTML tags, parenthesized Latin annotations, and bracketed phonetics
    item = re.sub(r"<.+?>|[((][A-Za-z\s]+[))]|\[[a-z.]+\]", "", item)
    item = re.sub(r"(\s*…\s*)+", "…", item)
    # Split on whitespace, Chinese/ASCII separators, and POS labels such as "n."
    mean_list = re.split(r"\s+|[;,,;]|[a-z]+\.", item)
    mean_list = [mean.strip() for mean in mean_list if mean.strip()]
    mean_list = list(set(mean_list))  # de-duplicate senses
    return mean_list
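# A minimal sketch of means_split on a made-up dictionary-style gloss in the
# "pos. sense;sense" format the regexes above expect.
def _demo_means_split():
    gloss = "n. 书;课本 v. 预订"  # hypothetical entry
    print(means_split(gloss))  # e.g. ['书', '课本', '预订'] (set order varies)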
def words_classify_from_textbook(path, source, word2mean):
    """
    Collect the vocabulary of secondary-school textbooks.
    :param path: path to the Excel workbook for one textbook edition
    :param source: edition label, e.g. "新人教"
    :param word2mean: dict mapping word -> {source + sheet: gloss}
    :return:
    """
    sheets = ["必修一", "必修二", "必修三", "选择性必修一", "选择性必修二", "选择性必修三", "选择性必修四"]
    if "旧" in source:
        sheets = ["必修一", "必修二", "必修三", "必修四", "必修五", "选修六", "选修七", "选修八"]
    for sheet in sheets:
        df1 = pd.read_excel(path, sheet_name=sheet)
        print(source + sheet)
        # word2mean = dict(zip(df1["单词"], df1["词性词义"]))
        for i, row in df1.iterrows():
            # print(row["单词"])
            word = re.sub(r"^\*", "", row["单词"].strip())  # drop the leading asterisk marker
            mean = row["词性词义"].replace("\n", ";")  # keep the gloss on a single line
            if word in word2mean:
                word2mean[word].update({source + sheet: mean})
            else:
                word2mean[word] = {source + sheet: mean}
            # print('"{}": {{"mean": "{}", "source": "{}{}"}},'.format(
            #     word, row["词性词义"].replace("\n", ";"), source, sheet))
    return word2mean
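# A hedged sketch of how words_classify_from_textbook is driven (mirroring the
# commented edition loop in __main__ below); the workbook path is hypothetical.
def _demo_words_classify_from_textbook():
    word2mean = {}
    for edition in ["新人教", "旧人教"]:
        path = r"{}单词全册.xlsx".format(edition)  # hypothetical local workbook
        word2mean = words_classify_from_textbook(path, edition, word2mean)
    return word2mean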
def word_screen(one_d):
    """
    Filter words whose entry contains (), /, =, or marks an abbreviation (缩写).
    :param one_d: {"word": {"source": "gloss"}}
    :return:
    """
    for k, v in one_d.items():
        # if re.search("[(())\[\]/=]|缩写", k) or \
        #         re.search("[(())\[\]/=]|缩写", ";".join(list(v.values()))):
        #     print("{}: {},".format(k, v))
        if re.search("=|缩写", ";".join(list(v.values()))):
            print("'{}': {},".format(k, v))
# a = [
# 'good/bad-tempered',
# 'in good/bad shape',
# 'in the short/long term',
# 'switch off/on',
# "turn one's back (on sb/sth)",
# 'the International Olympic Committee/IOC',
# 'no longer/not…any longer', 'in good/poor condition', 'in addition (to sb/sth)', "easy-going/i:zi:'gəuiŋ/",
# 'break away (from sb/sth)',
# 'be / feel in the mood (for sth. / to do sth.)',
# '(be) true of / for',
# 'a great/good many',
# 'absorbed in sth/sb',
# 'commit oneself to (sth/doing sth/do sth)',
# 'analyse [NAmE -ze]',
# '(be) bound to (do) …',
# 'be bound to (do) …',
# '(be) bound to …',
# 'set (a play, novel, etc.) in',
# 'pi(π)',
# 'pin (on)',
# '2D(2-dimensional)',
# 'AI (artificial intelligence)',
# 'AR(Augmented Reality)',
# 'MR(Mixed Reality)',
# 'PhD (Doctor of Philosophy)',
# 'VR(Virtual Reality)',
# '(at) first hand',
# '(be) allergic to',
# 'am(ante meridiem)',
# "when the cat's away (the mice will play)",
#
# ]
# if re.search("[(())]", k) and k not in a:
# k = re.sub("\s*/\s*", "/", k).replace("(", "(").replace(")", ")")
# # k1 = re.sub("/[a-z]+$|/[a-z]+(?=\s)", "", k)
# # k2 = re.sub("(?<=\s)[a-z]+/|^[a-z]+/", "", k)
# k4 = ""
# if "(be)" in k:
# k3 = re.sub("\(be\)", " be ", k).replace(" ", " ").strip()
# else:
# k3 = re.sub("\(.*?\)", " ", k).replace(" ", " ").strip()
# k4 = re.sub(r"\((.*?)\)", r" \1 ", k).replace(" ", " ").strip()
# print("'{}': {},".format(k, v))
# print("'{}': {},".format(k3, v))
# if k4:
# print("'{}': {},".format(k4, v))
# # print("---------------------------")
def phrase_classify(en_word):
    """
    Classify a phrase by its POS-tag pattern.
    :return: "v-phrase" / "prep-phrase" / "n-phrase" for multi-word input,
             otherwise the POS tag of the single word
    """
    text = word_tokenize(en_word)
    ptag = pos_tag(text)
    # print("phrase_classify:::::", ptag)
    if ptag:
        if len(en_word.split(" ")) > 1:
            if ptag[0][1] in ["VB", "V"]:
                return "v-phrase"
            if ptag[0][1] == "IN":
                return "prep-phrase"
            if len(ptag) == 2 and ptag[1][1] == "NN" and ptag[0][1] in ["NN", "ADJ", "JJ", "RB"]:
                return "n-phrase"
            if "NN+IN+NN" in "+".join([i[1] for i in ptag]):
                return "n-phrase"
        else:
            # Single word: return its POS tag directly
            return ptag[0][-1]
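# A minimal sketch of phrase_classify; it assumes the NLTK data packages
# "punkt" and "averaged_perceptron_tagger" have already been downloaded.
# The expected outputs are indicative only, since pos_tag can tag short,
# context-free phrases differently.
def _demo_phrase_classify():
    print(phrase_classify("break away"))   # "v-phrase" if "break" is tagged VB
    print(phrase_classify("in addition"))  # "prep-phrase" ("in" tagged IN)
    print(phrase_classify("book"))         # single word: its POS tag, e.g. "NN"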
# (Rewritten) MyThread.py thread class, so that a thread can return a value
# class MyThread(threading.Thread):
# def __init__(self, func, args=(), kwargs=None):
# super(MyThread, self).__init__()
# self.func = func
# self.args = args
# self.kwargs = kwargs
#
# # The overridden run() no longer executes the parent class's run()
# # Note: run() cannot return a value to the caller, even with "return self.func(*self.args)"
# def run(self):
# if self.kwargs:
# self.result = self.func(self.kwargs["arg1"], self.kwargs["arg2"])
# else:
# self.result = self.func(*self.args)
#
# def get_result(self):
# # return self.result
# # Must wait for the thread to finish; reading result before completion yields nothing
# threading.Thread.join(self)
# try:
# return self.result
# except Exception:
# return None
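# A commented usage sketch for MyThread (kept commented out, like the class
# itself); the target function and its arguments are placeholders.
# t = MyThread(cos_sim, args=(str11, str22))
# t.start()
# print(t.get_result())  # get_result() joins the thread before reading the result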
# POS tagging with THULAC: http://thulac.thunlp.org/demo
if __name__ == '__main__':
from pprint import pprint
    # Examples
# str1 = "现在什么时候了"
# str2 = "什么时候了现在"
str1 = "A highly recommended book"
str2 = "a highly recommendable book"
str11 = jieba.lcut(str1)
str22 = jieba.lcut(str2)
    # print('str1=' + str1)
    # print(str11)  # tokens after jieba segmentation
    # diff_result = difflib.SequenceMatcher(None, str1, str2).ratio()
    # print('Method 1 (standard-library difflib) score: ' + str(diff_result))
    # print('Method 2 (fuzzywuzzy) score: ' + str(fuzz.ratio(str1, str2) / 100))
st1 = time.time()
    print('Method 3 (edit distance) score: ' + str(edit_similar(str11, str22)), "tt1:", time.time() - st1)
st2 = time.time()
    # print('Method 4 (cosine similarity) score: ' + str(cos_sim(str11, str22)), "tt2:", time.time() - st2)
# ---------------------------------------------
# edition = ["新人教", "新外研", "新牛津", "新北师大", "旧人教", "旧外研", "旧牛津", "旧北师大"]
# # edition = ["旧北师大"]
# word2mean_dict = {}
# for edit in edition:
# path = r"G:\zwj\WL\en2cn\files\教材义\教材单元汇总表格\{}单词全册.xlsx".format(edit)
# res = words_classify_from_textbook(path, edit, word2mean_dict)
# word2mean_dict.update(res)
# pprint(word2mean_dict)
# ---------------------------------------------------------
# path = r"G:\zwj\WL\en2cn\files\教材义\初中考纲1600词【词汇+词性词义】.xlsx"
# df1 = pd.read_excel(path)
# word2mean = {}
# for i, row in df1.iterrows():
# # print(row["单词"])
# word = re.sub(r"^\*", "", row["词汇"].strip())
# mean = row["词性词义"].replace("\n", "
").replace("\ue009", " ")
# if word in word2mean:
# print(word)
# else:
# word2mean[word] = mean
# pprint(word2mean)
# ----------------------------------------------------
from Words.word_dict_from_textbook import word2mean_high
# from Words.Phrase_dict import phrases_dict_bing
# # word_screen(word2mean_high)
# for k, v in phrases_dict_bing.items():
# zh_full_wash(v, source="bing")
# ----------------------------------------------
# text = word_tokenize('categorise')
# ptag = pos_tag(text)
# print(ptag)
# t1 = time.time()
# phrase_classify('evaporate')
# print(time.time() - t1)
# from Words.Phrase_dict import phrases_dict_tk
# for k, v in phrases_dict_tk.items():
# if " " in k:
# print(k)
    # ---------------- Aggregate wrong senses (错误意思) from grading data ----------------
from Words.Phrase_dict import errmean_en_dict
# path = r"G:\zwj\WL\en2cn\files\复评文件\英译汉自动批改9-29.xlsx"
# df = pd.read_excel(path, sheet_name="Sheet1")
# df = df.dropna(subset="错误意思", axis=0)
# # print(df["错误意思"])
# # errmean_word = {}
# for i, row in df.iterrows():
# # print(row["单词"], row["错误意思"])
# enword = row["单词"].strip()
# errmean = re.sub("\.\s*$", "", row["错误意思"]).strip()
# if enword not in errmean_en_dict:
# errmean_en_dict[enword] = errmean
# else:
# for j in re.split("[;;]", errmean):
# if j not in errmean_en_dict[enword] and (";"+j) not in errmean_en_dict[enword]:
# errmean_en_dict[enword] += ";" + j
# pprint(errmean_en_dict)