#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Simple Chinese spelling corrector: segment a sentence with jieba, then replace
out-of-vocabulary phrases with the most frequent known phrase that is one edit
away, preferring candidates whose pinyin matches the original phrase."""
import os
import string

import jieba
import pinyin

from my_config import dirpath

FILE_PATH = os.path.join(dirpath, "Autochecker4Chinese/token_freq_pos%40350k_jieba.txt")
# G:\zwj\WL\en2cn\Autochecker4Chinese\token_freq_pos%40350k_jieba.txt

PUNCTUATION_LIST = string.punctuation
PUNCTUATION_LIST += "。,?:;{}[]‘“”《》/!%……()"


def construct_dict(file_path):
    """Load the phrase-frequency dictionary (one `token freq pos` entry per line)."""
    word_freq = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            info = line.split()
            word = info[0]
            frequency = info[1]
            # store the frequency as an int so max(..., key=phrase_freq.get)
            # compares counts numerically rather than as strings
            word_freq[word] = int(frequency)
    return word_freq


phrase_freq = construct_dict(FILE_PATH)


def load_cn_words_dict(file_path):
    """Concatenate all characters in the character dictionary into one string."""
    cn_words_dict = ""
    with open(file_path, "r", encoding="utf-8") as f:
        for word in f:
            cn_words_dict += word.strip()
    return cn_words_dict


def edits1(phrase, cn_words_dict):
    """
    All edits that are one edit away from `phrase`
    :param phrase: the (possibly misspelled) phrase
    :param cn_words_dict: string of candidate Chinese characters
    :return: set of phrases one delete/transpose/replace/insert away
    """
    splits = [(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict]
    inserts = [L + c + R for L, R in splits for c in cn_words_dict]
    return set(deletes + transposes + replaces + inserts)


def known(phrases):
    """Keep only the candidate phrases that appear in the frequency dictionary."""
    return set(phrase for phrase in phrases if phrase in phrase_freq)


def get_candidates(error_phrase):
    candidates_1st_order = []  # same pinyin for the whole phrase
    candidates_2nd_order = []  # same pinyin for the first character only
    candidates_3rd_order = []  # everything else
    error_pinyin = pinyin.get(error_phrase, format="strip", delimiter="/")
    cn_words_dict = load_cn_words_dict(os.path.join(dirpath, "Autochecker4Chinese/cn_dict.txt"))
    candidate_phrases = list(known(edits1(error_phrase, cn_words_dict)))
    for candidate_phrase in candidate_phrases:
        candidate_pinyin = pinyin.get(candidate_phrase, format="strip", delimiter="/")
        if candidate_pinyin == error_pinyin:
            candidates_1st_order.append(candidate_phrase)
        elif candidate_pinyin.split("/")[0] == error_pinyin.split("/")[0]:
            candidates_2nd_order.append(candidate_phrase)
        else:
            candidates_3rd_order.append(candidate_phrase)
    return candidates_1st_order, candidates_2nd_order, candidates_3rd_order


def auto_correct(error_phrase, flag="max"):
    """
    flag="max": return the most likely candidate
    flag="all": return all candidates
    :param error_phrase: phrase to correct
    :param flag: "max" or "all"
    :return: corrected phrase (flag="max") or list of candidates (flag="all")
    """
    c1_order, c2_order, c3_order = get_candidates(error_phrase)
    if flag == "max":
        if c1_order:
            return max(c1_order, key=phrase_freq.get)
        elif c2_order:
            return max(c2_order, key=phrase_freq.get)
        elif c3_order:
            return max(c3_order, key=phrase_freq.get)
        else:
            return error_phrase
    else:
        if c1_order:
            return c1_order
        elif c2_order:
            return c2_order
        elif c3_order:
            return c3_order
        else:
            return []


def auto_correct_sentence(error_sentence, verbose=True):
    # segment the sentence into phrases with jieba (accurate mode)
    seg_list = list(jieba.cut(error_sentence, cut_all=False))
    correct_sentence = ""
    for phrase in seg_list:
        correct_phrase = phrase
        # skip punctuation
        if phrase not in PUNCTUATION_LIST:
            # if the phrase is not in our dict, treat it as misspelled and correct it
            if phrase not in phrase_freq:
                correct_phrase = auto_correct(phrase)
                if verbose:
                    print(phrase, correct_phrase)
        correct_sentence += correct_phrase
    return correct_sentence


if __name__ == '__main__':
    err_sent_1 = '机七学习是人工智能领遇最能体现智能的一个分知!'
    correct_sent = auto_correct_sentence(err_sent_1)
    print("original sentence:" + err_sent_1 + "\n==>\n" + "corrected sentence:" + correct_sent)

    error_phrase_1 = "呕涂"  # should be "呕吐"
    error_phrase_2 = "东方之朱"  # should be "东方之珠"
    error_phrase_3 = "启程安排"
    # print(error_phrase_1, auto_correct(error_phrase_1))
    # print(error_phrase_2, auto_correct(error_phrase_2))
    print(error_phrase_3, auto_correct(error_phrase_3, flag="all"))
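
# Optional performance sketch: get_candidates() re-reads cn_dict.txt on every
# call, so correcting many phrases repeats the same file read. One possible fix
# is to memoize the load with the standard-library functools.lru_cache; the
# wrapper name below (_load_cn_words_dict_cached) is illustrative only, and
# get_candidates() would need to call it instead of load_cn_words_dict().
#
# from functools import lru_cache
#
# @lru_cache(maxsize=None)
# def _load_cn_words_dict_cached(file_path):
#     return load_cn_words_dict(file_path)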