"""OCR helpers: run Tesseract on an image and clean up the recognized text."""
import re

import pytesseract
from PIL import Image

from correct import correct_sent

# Character classes used to classify OCR output.
chinese_pat = re.compile(r"[\u4e00-\u9fa5]+")
english_pat = re.compile(r"[a-zA-Z]+")
# Raw string: "\s" in a plain literal is an invalid escape sequence.
space_pat = re.compile(r"\s+")


def is_chinese(s: str) -> bool:
    """Return True if *s* starts with a character in the \\u4e00-\\u9fa5 range.

    ``match`` anchors at the start only, so for a multi-character string
    just the leading character(s) decide the result.
    """
    return chinese_pat.match(s) is not None


def is_english(s: str) -> bool:
    """Return True if *s* starts with an ASCII letter (a-z or A-Z)."""
    return english_pat.match(s) is not None


def is_chinese_line(line: str, cc_ce_difference: int = 5) -> int:
    """Classify *line* as Chinese or English by counting characters.

    Args:
        line: The text line to classify.
        cc_ce_difference: Lead (in characters) of one script over the other
            at which scanning stops early and the leading script wins.

    Returns:
        1 if the line is Chinese, -1 if it is English, 0 if it contains
        neither Chinese characters nor ASCII letters. A non-zero tie between
        the two counts is reported as English (-1).
    """
    cc = 0  # Chinese character count
    ce = 0  # English letter count
    for c in line:
        if is_chinese(c):
            cc += 1
        elif is_english(c):
            ce += 1
        # Early exit once one script clearly dominates the prefix seen so far.
        if abs(cc - ce) >= cc_ce_difference:
            return 1 if cc > ce else -1
    if cc == 0 and ce == 0:
        return 0
    return 1 if cc > ce else -1


def remove_space(s: str) -> str:
    """Return *s* with every run of whitespace removed."""
    return space_pat.sub("", s)


def process_orc_text(text: str) -> str:
    """Clean up raw OCR text line by line.

    Blank lines are dropped and surrounding whitespace is stripped. Lines
    classified as Chinese have all whitespace removed; lines classified as
    English are passed through ``correct_sent`` for error correction; lines
    of undetermined script are kept as stripped.

    Args:
        text: Raw OCR output, with lines separated by ``"\\n"``.

    Returns:
        The cleaned lines joined with ``"\\n"``.
    """
    lines = []
    for line in text.split("\n"):
        # Skip blank lines.
        line = line.strip()
        if not line:
            continue
        chi = is_chinese_line(line)
        if chi == 1:
            # Chinese line: drop the spaces between characters.
            line = remove_space(line)
        elif chi == -1:
            # English line: apply error correction.
            line = correct_sent(line)
        lines.append(line)
    return "\n".join(lines)


def img2text(imgpath, lang='chi_sim'):
    """Run OCR on the image at *imgpath* and return the cleaned-up text.

    Args:
        imgpath: Image location, passed to ``PIL.Image.open``.
        lang: Language passed to ``pytesseract.image_to_string``
            (defaults to ``'chi_sim'``).

    Returns:
        The recognized text after ``process_orc_text`` post-processing.
    """
    # Context manager closes the underlying image file once OCR is done.
    with Image.open(imgpath) as im:
        text = pytesseract.image_to_string(im, lang=lang)
    return process_orc_text(text)