auto_check.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. # !/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import sys
  4. import pinyin
  5. import jieba
  6. import string
  7. import os
  8. from my_config import dirpath
  9. # import re
  10. FILE_PATH = os.path.join(dirpath, "Autochecker4Chinese/token_freq_pos%40350k_jieba.txt")
  11. # G:\zwj\WL\en2cn\Autochecker4Chinese\token_freq_pos%40350k_jieba.txt
  12. PUNCTUATION_LIST = string.punctuation
  13. PUNCTUATION_LIST += "。,?:;{}[]‘“”《》/!%……()"
  14. def construct_dict(file_path):
  15. word_freq = {}
  16. with open(file_path, "r", encoding="utf-8") as f:
  17. for line in f:
  18. info = line.split()
  19. word = info[0]
  20. frequency = info[1]
  21. word_freq[word] = frequency
  22. return word_freq
  23. phrase_freq = construct_dict(FILE_PATH)
  24. def load_cn_words_dict(file_path):
  25. cn_words_dict = ""
  26. with open(file_path, "r", encoding="utf-8") as f:
  27. for word in f:
  28. cn_words_dict += word.strip()
  29. return cn_words_dict
  30. def edits1(phrase, cn_words_dict):
  31. """
  32. All edits that are one edit away from `phrase`
  33. :param phrase:
  34. :param cn_words_dict:
  35. :return:
  36. """
  37. splits = [(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)]
  38. deletes = [L + R[1:] for L, R in splits if R]
  39. transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
  40. replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict]
  41. inserts = [L + c + R for L, R in splits for c in cn_words_dict]
  42. return set(deletes + transposes + replaces + inserts)
  43. def known(phrases): return set(phrase for phrase in phrases if phrase in phrase_freq)
  44. def get_candidates(error_phrase):
  45. candidates_1st_order = []
  46. candidates_2nd_order = []
  47. candidates_3nd_order = []
  48. error_pinyin = pinyin.get(error_phrase, format="strip", delimiter="/").encode("utf-8")
  49. cn_words_dict = load_cn_words_dict(os.path.join(dirpath, "Autochecker4Chinese/cn_dict.txt"))
  50. candidate_phrases = list(known(edits1(error_phrase, cn_words_dict)))
  51. for candidate_phrase in candidate_phrases:
  52. candidate_pinyin = pinyin.get(candidate_phrase, format="strip", delimiter="/").encode("utf-8")
  53. if candidate_pinyin == error_pinyin:
  54. candidates_1st_order.append(candidate_phrase)
  55. elif candidate_pinyin.decode("utf-8").split("/")[0] == error_pinyin.decode("utf-8").split("/")[0]:
  56. candidates_2nd_order.append(candidate_phrase)
  57. else:
  58. candidates_3nd_order.append(candidate_phrase)
  59. return candidates_1st_order, candidates_2nd_order, candidates_3nd_order
  60. def auto_correct(error_phrase, flag="max"):
  61. """
  62. flag="max":返回最可能的候选项
  63. flag="all":返回所有候选项
  64. :param error_phrase:
  65. :param flag:
  66. :return:
  67. """
  68. c1_order, c2_order, c3_order = get_candidates(error_phrase)
  69. # print(c1_order)
  70. # print(c2_order)
  71. # print(c3_order)
  72. if flag == "max":
  73. if c1_order:
  74. return max(c1_order, key=phrase_freq.get)
  75. elif c2_order:
  76. return max(c2_order, key=phrase_freq.get)
  77. elif c3_order:
  78. return max(c3_order, key=phrase_freq.get)
  79. else:
  80. return error_phrase
  81. else:
  82. if c1_order:
  83. return c1_order
  84. elif c2_order:
  85. return c2_order
  86. elif c3_order:
  87. return c3_order
  88. else:
  89. return []
  90. def auto_correct_sentence(error_sentence, verbose=True):
  91. jieba_cut = jieba.cut(error_sentence, cut_all=False)
  92. seg_list = "\t".join(jieba_cut).split("\t")
  93. correct_sentence = ""
  94. for phrase in seg_list:
  95. correct_phrase = phrase
  96. # check if item is a punctuation
  97. if phrase not in PUNCTUATION_LIST:
  98. # check if the phrase in our dict, if not then it is a misspelled phrase
  99. if phrase not in phrase_freq.keys():
  100. correct_phrase = auto_correct(phrase)
  101. if verbose:
  102. print(phrase, correct_phrase)
  103. correct_sentence += correct_phrase
  104. return correct_sentence
  105. if __name__ == '__main__':
  106. import os
  107. err_sent_1 = '机七学习是人工智能领遇最能体现智能的一个分知!'
  108. correct_sent = auto_correct_sentence(err_sent_1)
  109. print("original sentence:" + err_sent_1 + "\n==>\n" + "corrected sentence:" + correct_sent)
  110. error_phrase_1 = "呕涂" # should be "呕吐"
  111. error_phrase_2 = "东方之朱" # should be "东方之珠"
  112. error_phrase_3 = "启程安排" # should be "沙龙"
  113. # print(error_phrase_1, auto_correct(error_phrase_1))
  114. # print(error_phrase_2, auto_correct(error_phrase_2))
  115. print(error_phrase_3, auto_correct(error_phrase_3, flag="all"))