util.py

# -*- coding: utf-8 -*-
import re
import jieba  # jieba Chinese word segmentation
import time
# import difflib  # Method 1: Python's standard library difflib computes similarity directly
# from fuzzywuzzy import fuzz  # Method 2: the third-party fuzzywuzzy package computes similarity directly
import numpy as np
from collections import Counter
import pandas as pd
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
# import multiprocessing
# import threading


# Compute the similarity of two Chinese sentences.
# Method 3: edit distance, also known as Levenshtein distance.
def edit_similar(str1, str2):  # str1 and str2 are token lists produced by segmentation
    len_str1 = len(str1)
    len_str2 = len(str2)
    taglist = np.zeros((len_str1 + 1, len_str2 + 1))
    # Boundary row/column must run to len + 1 so the last cells are initialised.
    for a in range(len_str1 + 1):
        taglist[a][0] = a
    for a in range(len_str2 + 1):
        taglist[0][a] = a
    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            if str1[i - 1] == str2[j - 1]:
                temp = 0
            else:
                temp = 1
            taglist[i][j] = min(taglist[i - 1][j - 1] + temp, taglist[i][j - 1] + 1, taglist[i - 1][j] + 1)
    return 1 - taglist[len_str1][len_str2] / max(len_str1, len_str2)
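

# Minimal usage sketch for edit_similar (the token lists are written out by
# hand here rather than produced by jieba, so the inputs are an assumption):
# >>> edit_similar(["今天", "天气", "很", "好"], ["今天", "天气", "不错"])
# one substitution plus one deletion over max length 4 gives 1 - 2/4 = 0.5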


# Method 4: cosine similarity.
def cos_sim(str1, str2):  # str1 and str2 are token lists produced by segmentation
    co_str1 = Counter(str1)
    co_str2 = Counter(str2)
    p_str1 = []
    p_str2 = []
    for temp in set(str1 + str2):
        p_str1.append(co_str1[temp])
        p_str2.append(co_str2[temp])
    p_str1 = np.array(p_str1)
    p_str2 = np.array(p_str2)
    return p_str1.dot(p_str2) / (np.sqrt(p_str1.dot(p_str1)) * np.sqrt(p_str2.dot(p_str2)))
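

# Minimal usage sketch for cos_sim (same hand-written token lists as above,
# so the inputs are an assumption):
# >>> cos_sim(["今天", "天气", "很", "好"], ["今天", "天气", "不错"])
# dot product 2 over norms 2 * sqrt(3) gives about 0.577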


def misspelled_words_detect():
    """
    Typo detection.
    pip install pycorrector
    Installing kenlm:
    pip install https://github.com/kpu/kenlm/archive/master.zip
    The rule-based method loads the kenlm language model file from
    ~/.pycorrector/datasets/zh_giga.no_cna_cmn.prune01244.klm by default;
    if the file is not found, the program downloads it automatically.
    :return:
    """
    # Typo detection
    # import pycorrector
    # corrected_sent, detail = pycorrector.correct('让坐')
    # print(corrected_sent, detail)


def means_split(item):
    """
    Split a word's or phrase's dictionary gloss into individual senses.
    English POS labels: n. v. num. adj. adv. pron. prep. art. conj. int.
    :return:
    """
    item = re.sub(r"<.+?>|[((][A-Za-z\s]+[))]|\[[a-z.]+\]", "", item)
    item = re.sub(r"(\s*…\s*)+", "…", item)
    mean_list = re.split(r"\s+|[;,,;]|[a-z]+\.", item)
    mean_list = [mean.strip() for mean in mean_list if mean.strip()]
    mean_list = list(set(mean_list))
    return mean_list
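

# Minimal usage sketch for means_split (the gloss string is a made-up example):
# >>> means_split("n. 苹果;水果 v. 削苹果")
# POS labels and separators are stripped, leaving senses such as
# ['苹果', '水果', '削苹果'] (set order is not guaranteed)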


def words_classify_from_textbook(path, source, word2mean):
    """
    Collect the words from secondary-school textbooks.
    :param path: path to the Excel workbook
    :param source: textbook edition label
    :param word2mean: dict mapping word -> {source + sheet: meaning}
    :return:
    """
    sheets = ["必修一", "必修二", "必修三", "选择性必修一", "选择性必修二", "选择性必修三", "选择性必修四"]
    if "旧" in source:
        sheets = ["必修一", "必修二", "必修三", "必修四", "必修五", "选修六", "选修七", "选修八"]
    for sheet in sheets:
        df1 = pd.read_excel(path, sheet_name=sheet)
        print(source + sheet)
        # word2mean = dict(zip(df1["单词"], df1["词性词义"]))
        for i, row in df1.iterrows():
            # print(row["单词"])
            word = re.sub(r"^\*", "", row["单词"].strip())
            mean = row["词性词义"].replace("\n", "<br>")
            if word in word2mean:
                word2mean[word].update({source + sheet: mean})
            else:
                word2mean[word] = {source + sheet: mean}
            # print('"{}": {{"mean": "{}", "source": "{}{}"}},'.format(
            #     word, row["词性词义"].replace("\n", ";<br>"), source, sheet))
    return word2mean
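

# Minimal usage sketch (the workbook filename is hypothetical; the author's
# real paths appear commented out under __main__ below):
# >>> word2mean_dict = words_classify_from_textbook("新人教单词全册.xlsx", "新人教", {})
# Each entry ends up shaped like {"apple": {"新人教必修一": "n. 苹果"}}.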


def word_screen(one_d):
    """
    Filter out words whose entry contains ( ) , / or = .
    :param one_d: {"word": {"source": "meaning"}}
    :return:
    """
    for k, v in one_d.items():
        # if re.search("[(())\[\]/=]|缩写", k) or \
        #         re.search("[(())\[\]/=]|缩写", ";".join(list(v.values()))):
        #     print("{}: {},".format(k, v))
        if re.search("=|缩写", ";".join(list(v.values()))):  # "缩写" means "abbreviation"
            print("'{}': {},".format(k, v))
        # a = [
        #     'good/bad-tempered',
        #     'in good/bad shape',
        #     'in the short/long term',
        #     'switch off/on',
        #     "turn one's back (on sb/sth)",
        #     'the International Olympic Committee/IOC',
        #     'no longer/not…any longer', 'in good/poor condition', 'in addition (to sb/sth)', "easy-going/i:zi:'gəuiŋ/",
        #     'break away (from sb/sth)',
        #     'be / feel in the mood (for sth. / to do sth.)',
        #     '(be) true of / for',
        #     'a great/good many',
        #     'absorbed in sth/sb',
        #     'commit oneself to (sth/doing sth/do sth)',
        #     'analyse [NAmE -ze]',
        #     '(be) bound to (do) …',
        #     'be bound to (do) …',
        #     '(be) bound to …',
        #     'set (a play, novel, etc.) in',
        #     'pi(π)',
        #     'pin (on)',
        #     '2D(2-dimensional)',
        #     'AI (artificial intelligence)',
        #     'AR(Augmented Reality)',
        #     'MR(Mixed Reality)',
        #     'PhD (Doctor of Philosophy)',
        #     'VR(Virtual Reality)',
        #     '(at) first hand',
        #     '(be) allergic to',
        #     'am(ante meridiem)',
        #     "when the cat's away (the mice will play)",
        # ]
        # if re.search("[(())]", k) and k not in a:
        #     k = re.sub("\s*/\s*", "/", k).replace("(", "(").replace(")", ")")
        #     # k1 = re.sub("/[a-z]+$|/[a-z]+(?=\s)", "", k)
        #     # k2 = re.sub("(?<=\s)[a-z]+/|^[a-z]+/", "", k)
        #     k4 = ""
        #     if "(be)" in k:
        #         k3 = re.sub("\(be\)", " be ", k).replace(" ", " ").strip()
        #     else:
        #         k3 = re.sub("\(.*?\)", " ", k).replace(" ", " ").strip()
        #         k4 = re.sub(r"\((.*?)\)", r" \1 ", k).replace(" ", " ").strip()
        #     print("'{}': {},".format(k, v))
        #     print("'{}': {},".format(k3, v))
        #     if k4:
        #         print("'{}': {},".format(k4, v))
        #     # print("---------------------------")


def phrase_classify(en_word):
    """
    Classify a word or phrase by its part of speech.
    :return:
    """
    text = word_tokenize(en_word)
    ptag = pos_tag(text)
    # print("phrase_classify:::::", ptag)
    if ptag:
        if len(en_word.split(" ")) > 1:
            if ptag[0][1] in ["VB", "V"]:
                return "v-phrase"
            if ptag[0][1] == "IN":
                return "prep-phrase"
            if len(ptag) == 2 and ptag[1][1] == "NN" and ptag[0][1] in ["NN", "ADJ", "JJ", "RB"]:
                return "n-phrase"
            if "NN+IN+NN" in "+".join([i[1] for i in ptag]):
                return "n-phrase"
        else:
            return ptag[0][-1]
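

# Minimal usage sketch (the tags come from NLTK's default tagger, so the exact
# outputs below are assumptions; multi-word phrases matching no rule return None):
# >>> phrase_classify("in addition")   # "in" tagged IN -> "prep-phrase"
# >>> phrase_classify("break away")    # "break" typically tagged VB -> "v-phrase"
# >>> phrase_classify("apple")         # single word -> its POS tag, e.g. "NN"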


# (Rewritten) MyThread.py thread class, so that it can return a value.
# class MyThread(threading.Thread):
#     def __init__(self, func, args=(), kwargs=None):
#         super(MyThread, self).__init__()
#         self.func = func
#         self.args = args
#         self.kwargs = kwargs
#
#     # The overridden run() no longer executes the original run().
#     # Note: even with a return it will not return a value,
#     # e.g. return self.func(*self.args) is discarded.
#     def run(self):
#         if self.kwargs:
#             self.result = self.func(self.kwargs["arg1"], self.kwargs["arg2"])
#         else:
#             self.result = self.func(*self.args)
#
#     def get_result(self):
#         # return self.result
#         # Must wait for the thread to finish; fetching result before the
#         # thread completes yields nothing.
#         threading.Thread.join(self)
#         try:
#             return self.result
#         except Exception:
#             return None


# POS tagging with THULAC: http://thulac.thunlp.org/demo
if __name__ == '__main__':
    from pprint import pprint

    # Examples
    # str1 = "现在什么时候了"
    # str2 = "什么时候了现在"
    str1 = "A highly recommended book"
    str2 = "a highly recommendable book"
    str11 = jieba.lcut(str1)
    str22 = jieba.lcut(str2)
    # print('str1=' + str1)  # after jieba segmentation
    # print(str11)  # after jieba segmentation
    # diff_result = difflib.SequenceMatcher(None, str1, str2).ratio()
    # print('Method 1, difflib score: ' + str(diff_result))
    # print('Method 2, fuzzywuzzy score: ' + str(fuzz.ratio(str1, str2) / 100))
    st1 = time.time()
    print('Method 3, edit-distance score: ' + str(edit_similar(str11, str22)), "tt1:", time.time() - st1)
    st2 = time.time()
    # print('Method 4, cosine-similarity score: ' + str(cos_sim(str1, str2)), "tt2:", time.time() - st2)
    # ---------------------------------------------
    # edition = ["新人教", "新外研", "新牛津", "新北师大", "旧人教", "旧外研", "旧牛津", "旧北师大"]
    # # edition = ["旧北师大"]
    # word2mean_dict = {}
    # for edit in edition:
    #     path = r"G:\zwj\WL\en2cn\files\教材义\教材单元汇总表格\{}单词全册.xlsx".format(edit)
    #     res = words_classify_from_textbook(path, edit, word2mean_dict)
    #     word2mean_dict.update(res)
    # pprint(word2mean_dict)
    # ---------------------------------------------------------
    # path = r"G:\zwj\WL\en2cn\files\教材义\初中考纲1600词【词汇+词性词义】.xlsx"
    # df1 = pd.read_excel(path)
    # word2mean = {}
    # for i, row in df1.iterrows():
    #     # print(row["单词"])
    #     word = re.sub(r"^\*", "", row["词汇"].strip())
    #     mean = row["词性词义"].replace("\n", "<br>").replace("\ue009", " ")
    #     if word in word2mean:
    #         print(word)
    #     else:
    #         word2mean[word] = mean
    # pprint(word2mean)
    # ----------------------------------------------------
    from Words.word_dict_from_textbook import word2mean_high
    # from Words.Phrase_dict import phrases_dict_bing
    # # word_screen(word2mean_high)
    # for k, v in phrases_dict_bing.items():
    #     zh_full_wash(v, source="bing")
    # ----------------------------------------------
    # text = word_tokenize('categorise')
    # ptag = pos_tag(text)
    # print(ptag)
    # t1 = time.time()
    # phrase_classify('evaporate')
    # print(time.time() - t1)
    # from Words.Phrase_dict import phrases_dict_tk
    # for k, v in phrases_dict_tk.items():
    #     if " " in k:
    #         print(k)
    # ---------------- Aggregate wrong meanings from the grading data ----------------
    from Words.Phrase_dict import errmean_en_dict
    # path = r"G:\zwj\WL\en2cn\files\复评文件\英译汉自动批改9-29.xlsx"
    # df = pd.read_excel(path, sheet_name="Sheet1")
    # df = df.dropna(subset="错误意思", axis=0)
    # # print(df["错误意思"])
    # # errmean_word = {}
    # for i, row in df.iterrows():
    #     # print(row["单词"], row["错误意思"])
    #     enword = row["单词"].strip()
    #     errmean = re.sub(r"\.\s*$", "", row["错误意思"]).strip()
    #     if enword not in errmean_en_dict:
    #         errmean_en_dict[enword] = errmean
    #     else:
    #         for j in re.split("[;;]", errmean):
    #             if j not in errmean_en_dict[enword] and (";" + j) not in errmean_en_dict[enword]:
    #                 errmean_en_dict[enword] += ";" + j
    # pprint(errmean_en_dict)