# Autochecker4Chinese.py — automatic spelling checker/corrector for Chinese text.
  1. # !/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. __author__ = "zpgao"
  4. import sys
  5. import pinyin
  6. import jieba
  7. import string
  8. import re
  9. FILE_PATH = "./token_freq_pos%40350k_jieba.txt"
  10. PUNCTUATION_LIST = string.punctuation
  11. PUNCTUATION_LIST += "。,?:;{}[]‘“”《》/!%……()"
  12. def construct_dict(file_path):
  13. word_freq = {}
  14. with open(file_path, "r") as f:
  15. for line in f:
  16. info = line.split()
  17. word = info[0]
  18. frequency = info[1]
  19. word_freq[word] = frequency
  20. return word_freq
  21. def load_cn_words_dict(file_path):
  22. cn_words_dict = ""
  23. with open(file_path, "r") as f:
  24. for word in f:
  25. cn_words_dict += word.strip().decode("utf-8")
  26. return cn_words_dict
  27. def edits1(phrase, cn_words_dict):
  28. "All edits that are one edit away from `phrase`."
  29. phrase = phrase.decode("utf-8")
  30. splits = [(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)]
  31. deletes = [L + R[1:] for L, R in splits if R]
  32. transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
  33. replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict]
  34. inserts = [L + c + R for L, R in splits for c in cn_words_dict]
  35. return set(deletes + transposes + replaces + inserts)
  36. def known(phrases): return set(phrase for phrase in phrases if phrase.encode("utf-8") in phrase_freq)
  37. def get_candidates(error_phrase):
  38. candidates_1st_order = []
  39. candidates_2nd_order = []
  40. candidates_3nd_order = []
  41. error_pinyin = pinyin.get(error_phrase, format="strip", delimiter="/").encode("utf-8")
  42. cn_words_dict = load_cn_words_dict("./cn_dict.txt")
  43. candidate_phrases = list(known(edits1(error_phrase, cn_words_dict)))
  44. for candidate_phrase in candidate_phrases:
  45. candidate_pinyin = pinyin.get(candidate_phrase, format="strip", delimiter="/").encode("utf-8")
  46. if candidate_pinyin == error_pinyin:
  47. candidates_1st_order.append(candidate_phrase)
  48. elif candidate_pinyin.split("/")[0] == error_pinyin.split("/")[0]:
  49. candidates_2nd_order.append(candidate_phrase)
  50. else:
  51. candidates_3nd_order.append(candidate_phrase)
  52. return candidates_1st_order, candidates_2nd_order, candidates_3nd_order
  53. def auto_correct(error_phrase):
  54. c1_order, c2_order, c3_order = get_candidates(error_phrase)
  55. # print c1_order, c2_order, c3_order
  56. if c1_order:
  57. return max(c1_order, key=phrase_freq.get)
  58. elif c2_order:
  59. return max(c2_order, key=phrase_freq.get)
  60. else:
  61. return max(c3_order, key=phrase_freq.get)
  62. def auto_correct_sentence(error_sentence, verbose=True):
  63. jieba_cut = jieba.cut(error_sentence.decode("utf-8"), cut_all=False)
  64. seg_list = "\t".join(jieba_cut).split("\t")
  65. correct_sentence = ""
  66. for phrase in seg_list:
  67. correct_phrase = phrase
  68. # check if item is a punctuation
  69. if phrase not in PUNCTUATION_LIST.decode("utf-8"):
  70. # check if the phrase in our dict, if not then it is a misspelled phrase
  71. if phrase.encode("utf-8") not in phrase_freq.keys():
  72. correct_phrase = auto_correct(phrase.encode("utf-8"))
  73. if verbose:
  74. print
  75. phrase, correct_phrase
  76. correct_sentence += correct_phrase
  77. return correct_sentence
# Module-level phrase -> frequency table, built once at import time from the
# vocabulary file.  NOTE(review): this performs file I/O on import — confirm
# the file exists relative to the working directory of every caller.
phrase_freq = construct_dict(FILE_PATH)
def main():
    """Entry-point placeholder; the module is primarily used as a library."""
    pass


if __name__ == "__main__":
    main()