## Solutions of autochecker for chinese

### 1. Construct a detecter

In [1]:
# Step1 : construct a dict to detect the misspelled chinese phrase
# key is the chinese word, value is its corresponding frequency appeared in corpus
# you can finish this step by collecting corpus from the internet
# or you can choose a more easy way, load some dicts already created by others

In [2]:
def construct_dict( file_path ):
 
 word_freq = {}
 with open(file_path, "r") as f:
 for line in f:
 info = line.split()
 word = info[0]
 frequency = info[1]
 word_freq[word] = frequency
 
 return word_freq

In [3]:
FILE_PATH = "./token_freq_pos%40350k_jieba.txt"

phrase_freq = construct_dict( FILE_PATH )

In [4]:
print( type(phrase_freq) )
print( len(phrase_freq) )


349045


### 2. Construct an autocorrecter

In [5]:
import pinyin

In [11]:
# list for chinese words
# read from the words.dic
def load_cn_words_dict( file_path ):
 cn_words_dict = ""
 with open(file_path, "r") as f:
 for word in f:
 cn_words_dict += word.strip().decode("utf-8")
 return cn_words_dict

In [12]:
# function calculate the edite distance from the chinese phrase 
def edits1(phrase, cn_words_dict):
 "All edits that are one edit away from `phrase`."
 phrase = phrase.decode("utf-8")
 splits = [(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)]
 deletes = [L + R[1:] for L, R in splits if R]
 transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
 replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict]
 inserts = [L + c + R for L, R in splits for c in cn_words_dict]
 return set(deletes + transposes + replaces + inserts)

In [13]:
# return the phrease exist in phrase_freq
def known(phrases): return set(phrase for phrase in phrases if phrase.encode("utf-8") in phrase_freq)

In [14]:
# get the candidates phrase of the error phrase
# we sort the candidates phrase's importance according to their pinyin
# if the candidate phrase's pinyin exactly matches with the error phrase, we put them into first order
# if the candidate phrase's first word pinyin matches with the error phrase first word, we put them into second order
# else we put candidate phrase into the third order
def get_candidates( error_phrase ):
 
 candidates_1st_order = []
 candidates_2nd_order = []
 candidates_3nd_order = []
 
 error_pinyin = pinyin.get(error_phrase, format="strip", delimiter="/").encode("utf-8")
 cn_words_dict = load_cn_words_dict( "./cn_dict.txt" )
 candidate_phrases = list( known(edits1(error_phrase, cn_words_dict)) )
 
 for candidate_phrase in candidate_phrases:
 candidate_pinyin = pinyin.get(candidate_phrase, format="strip", delimiter="/").encode("utf-8")
 if candidate_pinyin == error_pinyin:
 candidates_1st_order.append(candidate_phrase)
 elif candidate_pinyin.split("/")[0] == error_pinyin.split("/")[0]:
 candidates_2nd_order.append(candidate_phrase)
 else:
 candidates_3nd_order.append(candidate_phrase)
 
 return candidates_1st_order, candidates_2nd_order, candidates_3nd_order

In [15]:
def auto_correct( error_phrase ):
 
 c1_order, c2_order, c3_order = get_candidates(error_phrase)
 # print c1_order, c2_order, c3_order
 if c1_order:
 return max(c1_order, key=phrase_freq.get )
 elif c2_order:
 return max(c2_order, key=phrase_freq.get )
 else:
 return max(c3_order, key=phrase_freq.get )

In [16]:
# test for the auto_correct 
error_phrase_1 = "呕涂" # should be "呕吐"
error_phrase_2 = "东方之朱" # should be "东方之珠"
error_phrase_3 = "沙拢" # should be "沙龙"

print error_phrase_1, auto_correct( error_phrase_1 )
print error_phrase_2, auto_correct( error_phrase_2 )
print error_phrase_3, auto_correct( error_phrase_3 )

呕涂 呕吐
东方之朱 东方之珠
沙拢 沙龙


### 3. Correct the misspelled phrase in a sentance 

In [17]:
# step 3 : Tokenization
# For any given sentence, use jieba do the segmentation
# Get segment list after segmentation is done
# check if the remain phrase exists in word_freq dict
# if not, then it is a misspelled phrase
# use auto_correct fun to correct the phrase

In [18]:
import jieba
import string
import re

In [19]:
PUNCTUATION_LIST = string.punctuation
PUNCTUATION_LIST += "。,?:;{}[]‘“”《》/!%……()"

In [21]:
def auto_correct_sentence( error_sentence, verbose=True):
 
 jieba_cut = jieba.cut(err_test.decode("utf-8"), cut_all=False)
 seg_list = "\t".join(jieba_cut).split("\t")
 
 correct_sentence = ""
 
 for phrase in seg_list:
 
 correct_phrase = phrase
 # check if item is a punctuation
 if phrase not in PUNCTUATION_LIST.decode("utf-8"):
 # check if the phrase in our dict, if not then it is a misspelled phrase
 if phrase.encode("utf-8") not in phrase_freq.keys():
 correct_phrase = auto_correct(phrase.encode("utf-8"))
 if verbose :
 print phrase, correct_phrase
 
 correct_sentence += correct_phrase
 
 if verbose:
 print correct_sentence
 return correct_sentence

In [23]:
err_sent = '机七学习是人工智能领遇最能体现智能的一个分知!'
correct_sent = auto_correct_sentence( err_sent )

机七 机器
领遇 领域
分知 分枝
机器学习是人工智能领域最能体现智能的一个分枝!


In [24]:
print correct_sent

机器学习是人工智能领域最能体现智能的一个分枝!
