{ "cells": [
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Solutions of autochecker for Chinese" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### 1. Construct a detector" ] },
 { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# Step 1: construct a dict used to detect misspelled Chinese phrases\n",
  "# key is the Chinese word, value is its frequency in the corpus\n",
  "# you can build this dict by collecting a corpus from the internet,\n",
  "# or take the easier way and load a dict already created by others" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "def construct_dict( file_path ):\n",
  "    \"\"\"Load a word -> frequency table from a whitespace-separated text file.\n",
  "\n",
  "    The first column of each line is taken as the word and the second\n",
  "    column as its frequency; any further columns are ignored.\n",
  "\n",
  "    Args:\n",
  "        file_path: path of the dictionary file to read.\n",
  "\n",
  "    Returns:\n",
  "        dict mapping each word, exactly as read from the file, to its\n",
  "        frequency as an int. If a word occurs on several lines the last\n",
  "        line wins.\n",
  "\n",
  "    Raises:\n",
  "        IndexError: if a non-blank line has fewer than two columns.\n",
  "        ValueError: if the frequency column is not an integer.\n",
  "    \"\"\"\n",
  "    word_freq = {}\n",
  "    with open(file_path, \"r\") as f:\n",
  "        for line in f:\n",
  "            info = line.split()\n",
  "            # a blank line carries no entry; skip it instead of failing on info[0]\n",
  "            if not info:\n",
  "                continue\n",
  "            # store a number rather than text so that frequencies compare\n",
  "            # numerically (as strings, \"9\" sorts above \"10\")\n",
  "            word_freq[info[0]] = int(info[1])\n",
  "\n",
  "    return word_freq" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "FILE_PATH = \"./token_freq_pos%40350k_jieba.txt\"\n",
  "\n",
  "phrase_freq = construct_dict( FILE_PATH )" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "349045\n" ] } ], "source": [
  "print( type(phrase_freq) )\n",
  "print( len(phrase_freq) )" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### 2. 
Construct an autocorrector" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "import pinyin" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# characters used to build replacement / insertion edits,\n",
  "# read from a character dictionary file\n",
  "def load_cn_words_dict( file_path ):\n",
  "    \"\"\"Read a character file and return its content as one unicode string.\n",
  "\n",
  "    Every line is stripped of surrounding whitespace, decoded as UTF-8 and\n",
  "    the pieces are concatenated in file order.\n",
  "\n",
  "    Args:\n",
  "        file_path: path of the character file to read.\n",
  "\n",
  "    Returns:\n",
  "        unicode string holding all characters of the file.\n",
  "    \"\"\"\n",
  "    with open(file_path, \"r\") as f:\n",
  "        # join once instead of growing a string with += on every line\n",
  "        return u\"\".join(word.strip().decode(\"utf-8\") for word in f)" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# generate every phrase within edit distance 1 of a Chinese phrase\n",
  "def edits1(phrase, cn_words_dict):\n",
  "    \"\"\"All edits that are one edit away from `phrase`.\n",
  "\n",
  "    Args:\n",
  "        phrase: the phrase to edit, as a UTF-8 byte string or as unicode.\n",
  "        cn_words_dict: iterable of characters tried for replacements and insertions.\n",
  "\n",
  "    Returns:\n",
  "        set of unicode phrases obtained by one deletion, one swap of two\n",
  "        adjacent characters, one replacement or one insertion.\n",
  "    \"\"\"\n",
  "    # only byte strings need decoding; calling .decode on unicode text that\n",
  "    # holds non-ASCII characters raises UnicodeEncodeError under Python 2\n",
  "    if isinstance(phrase, bytes):\n",
  "        phrase = phrase.decode(\"utf-8\")\n",
  "    splits     = [(phrase[:i], phrase[i:])  for i in range(len(phrase) + 1)]\n",
  "    deletes    = [L + R[1:]                 for L, R in splits if R]\n",
  "    transposes = [L + R[1] + R[0] + R[2:]   for L, R in splits if len(R)>1]\n",
  "    replaces   = [L + c + R[1:]             for L, R in splits if R for c in cn_words_dict]\n",
  "    inserts    = [L + c + R                 for L, R in splits for c in cn_words_dict]\n",
  "    return set(deletes + transposes + replaces + inserts)" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# return the phrases that exist in phrase_freq\n",
  "def known(phrases):\n",
  "    \"\"\"Subset of the unicode `phrases` whose UTF-8 encoding is a key of `phrase_freq`.\"\"\"\n",
  "    return set(phrase for phrase in phrases if phrase.encode(\"utf-8\") in phrase_freq)" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# get the candidate phrases for an erroneous phrase\n",
  "# candidates are ranked by how well their pinyin matches the erroneous phrase:\n",
  "#   1st order: the whole pinyin matches exactly\n",
  "#   2nd order: only the pinyin of the first character matches\n",
  "#   3rd order: every other candidate\n",
  "CN_DICT_PATH = \"./cn_dict.txt\"\n",
  "\n",
  "# character files already read, keyed by path, so that the file is not\n",
  "# re-read for every phrase that gets corrected\n",
  "_cn_words_cache = {}\n",
  "\n",
  "def get_candidates( error_phrase, cn_dict_path=CN_DICT_PATH ):\n",
  "    \"\"\"Collect known phrases one edit away from `error_phrase`, grouped by pinyin match.\n",
  "\n",
  "    Args:\n",
  "        error_phrase: the suspicious phrase (callers in this notebook pass\n",
  "            UTF-8 byte strings).\n",
  "        cn_dict_path: file holding the characters used to generate edits.\n",
  "\n",
  "    Returns:\n",
  "        tuple of three lists of unicode phrases: candidates whose pinyin is\n",
  "        identical, candidates sharing only the first pinyin syllable, and\n",
  "        all remaining candidates.\n",
  "    \"\"\"\n",
  "    if cn_dict_path not in _cn_words_cache:\n",
  "        _cn_words_cache[cn_dict_path] = load_cn_words_dict( cn_dict_path )\n",
  "    cn_words_dict = _cn_words_cache[cn_dict_path]\n",
  "\n",
  "    candidates_1st_order = []\n",
  "    candidates_2nd_order = []\n",
  "    candidates_3rd_order = []\n",
  "\n",
  "    error_pinyin = pinyin.get(error_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n",
  "    # the first syllable of the erroneous phrase does not change inside the loop\n",
  "    error_first_pinyin = error_pinyin.split(\"/\")[0]\n",
  "    candidate_phrases = list( known(edits1(error_phrase, cn_words_dict)) )\n",
  "\n",
  "    for candidate_phrase in candidate_phrases:\n",
  "        candidate_pinyin = pinyin.get(candidate_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n",
  "        if candidate_pinyin == error_pinyin:\n",
  "            candidates_1st_order.append(candidate_phrase)\n",
  "        elif candidate_pinyin.split(\"/\")[0] == error_first_pinyin:\n",
  "            candidates_2nd_order.append(candidate_phrase)\n",
  "        else:\n",
  "            candidates_3rd_order.append(candidate_phrase)\n",
  "\n",
  "    return candidates_1st_order, candidates_2nd_order, candidates_3rd_order" ] },
 { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "def auto_correct( error_phrase ):\n",
  "    \"\"\"Return the most frequent candidate of the best non-empty pinyin tier.\n",
  "\n",
  "    Args:\n",
  "        error_phrase: the suspicious phrase, as accepted by `get_candidates`.\n",
  "\n",
  "    Returns:\n",
  "        the chosen correction as unicode, or `error_phrase` itself (decoded\n",
  "        to unicode) when no known phrase lies within one edit.\n",
  "    \"\"\"\n",
  "    def corpus_frequency(candidate):\n",
  "        # candidates are unicode while `known` looks phrases up in phrase_freq\n",
  "        # by their UTF-8 encoding; a lookup with the unicode object itself\n",
  "        # misses every time, which made max() ignore the frequencies\n",
  "        try:\n",
  "            return float(phrase_freq.get(candidate.encode(\"utf-8\"), 0))\n",
  "        except (TypeError, ValueError):\n",
  "            # a frequency that is not a number cannot rank a candidate\n",
  "            return 0.0\n",
  "\n",
  "    # tiers come back ordered from best to worst pinyin match\n",
  "    for candidates in get_candidates(error_phrase):\n",
  "        if candidates:\n",
  "            return max(candidates, key=corpus_frequency)\n",
  "\n",
  "    # nothing to choose from: max() on an empty list would raise ValueError\n",
  "    if isinstance(error_phrase, bytes):\n",
  "        return error_phrase.decode(\"utf-8\")\n",
  "    return error_phrase" ] },
 { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "呕涂 呕吐\n", "东方之朱 东方之珠\n", "沙拢 沙龙\n" ] } ], "source": [
  "# test for the auto_correct \n",
  "error_phrase_1 = \"呕涂\" # should be 
\"呕吐\"\n",
  "error_phrase_2 = \"东方之朱\" # should be \"东方之珠\"\n",
  "error_phrase_3 = \"沙拢\" # should be \"沙龙\"\n",
  "\n",
  "print error_phrase_1, auto_correct( error_phrase_1 )\n",
  "print error_phrase_2, auto_correct( error_phrase_2 )\n",
  "print error_phrase_3, auto_correct( error_phrase_3 )" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### 3. Correct the misspelled phrases in a sentence" ] },
 { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# Step 3: tokenization\n",
  "# For any given sentence, use jieba to do the segmentation\n",
  "# and get the list of segments once segmentation is done.\n",
  "# Check whether each remaining phrase exists in the phrase_freq dict;\n",
  "# if not, it is a misspelled phrase and\n",
  "# the auto_correct function is used to correct it" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "import jieba\n",
  "import string\n",
  "import re" ] },
 { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "PUNCTUATION_LIST = string.punctuation\n",
  "PUNCTUATION_LIST += \"。,?:;{}[]‘“”《》/!%……()\"" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
  "def auto_correct_sentence( error_sentence, verbose=True):\n",
  "    \"\"\"Segment a sentence with jieba and replace every unknown phrase by its correction.\n",
  "\n",
  "    Args:\n",
  "        error_sentence: the sentence to correct, as a UTF-8 byte string or as unicode.\n",
  "        verbose: when true, print every (original, corrected) pair and the\n",
  "            final sentence.\n",
  "\n",
  "    Returns:\n",
  "        the sentence rebuilt from the segments, with each segment that is\n",
  "        missing from `phrase_freq` replaced by the result of `auto_correct`.\n",
  "    \"\"\"\n",
  "    # segment the sentence that was passed in, not a variable left over in the kernel\n",
  "    if isinstance(error_sentence, bytes):\n",
  "        error_sentence = error_sentence.decode(\"utf-8\")\n",
  "    seg_list = list( jieba.cut(error_sentence, cut_all=False) )\n",
  "\n",
  "    # decode the punctuation once; a set gives a per-character membership test\n",
  "    punctuation = set( PUNCTUATION_LIST.decode(\"utf-8\") )\n",
  "\n",
  "    corrected_parts = []\n",
  "\n",
  "    for phrase in seg_list:\n",
  "\n",
  "        correct_phrase = phrase\n",
  "        # segments made only of punctuation or whitespace are copied through\n",
  "        is_punctuation = all(char in punctuation for char in phrase)\n",
  "        if not is_punctuation and phrase.strip():\n",
  "            # a phrase missing from our dict is treated as misspelled\n",
  "            if phrase.encode(\"utf-8\") not in phrase_freq:\n",
  "                try:\n",
  "                    correct_phrase = auto_correct(phrase.encode(\"utf-8\"))\n",
  "                except ValueError:\n",
  "                    # auto_correct may fail with ValueError when it has no\n",
  "                    # candidate to pick from; keep the segment unchanged\n",
  "                    correct_phrase = phrase\n",
  "                if verbose :\n",
  "                    print phrase, correct_phrase\n",
  "\n",
  "        corrected_parts.append(correct_phrase)\n",
  "\n",
  "    correct_sentence = \"\".join(corrected_parts)\n",
  "\n",
  "    if verbose:\n",
  "        print correct_sentence\n",
  "    return correct_sentence" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "机七 机器\n", "领遇 领域\n", "分知 分枝\n", "机器学习是人工智能领域最能体现智能的一个分枝!\n" ] } ], "source": [
  "err_sent = '机七学习是人工智能领遇最能体现智能的一个分知!'\n",
  "correct_sent = auto_correct_sentence( err_sent )" ] },
 { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "机器学习是人工智能领域最能体现智能的一个分枝!\n" ] } ], "source": [ "print correct_sent" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [] }
 ],
 "metadata": { "kernelspec": { "display_name": "nlp_interview", "language": "python", "name": "nlp_interview" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.13" } },
 "nbformat": 4, "nbformat_minor": 2 }