123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- import re
- import pytesseract
- from PIL import Image
- from correct import correct_sent
# One or more characters in the range U+4E00..U+9FA5.
chinese_pat = re.compile(r"[\u4e00-\u9fa5]+")
# One or more ASCII letters.
english_pat = re.compile(r"[a-zA-Z]+")


def is_chinese(s: str) -> bool:
    """Return True if *s* starts with a character in U+4E00..U+9FA5.

    ``match`` anchors at the beginning of the string, so only the leading
    character(s) decide the result; an empty string yields False.
    """
    return chinese_pat.match(s) is not None


def is_english(s: str) -> bool:
    """Return True if *s* starts with an ASCII letter (a-z or A-Z).

    ``match`` anchors at the beginning of the string, so only the leading
    character(s) decide the result; an empty string yields False.
    """
    return english_pat.match(s) is not None
def is_chinese_line(line: str, cc_ce_difference: int = 5) -> int:
    """Classify a line as Chinese, English, or undecided.

    Characters are scanned left to right, counting those accepted by
    ``is_chinese`` and those accepted by ``is_english``; any other character
    is ignored.

    Args:
        line: The text line to classify.
        cc_ce_difference: As soon as the two counts differ by at least this
            much, the scan stops and the current leader wins.

    Returns:
        1 if the line is Chinese, -1 if it is English, and 0 if not sure
        (no Chinese or English characters at all, or an exact tie).
    """
    cc = 0  # Chinese character count
    ce = 0  # English character count
    for c in line:
        if is_chinese(c):
            cc += 1
        elif is_english(c):
            ce += 1
        # Early exit: one script is already clearly dominant.
        if abs(cc - ce) >= cc_ce_difference:
            return 1 if cc > ce else -1
    # A tie (including 0 vs 0) gives no evidence either way, so report
    # "not sure" instead of defaulting to English.
    if cc == ce:
        return 0
    return 1 if cc > ce else -1
# Raw string: "\s" in a normal string literal is an invalid escape sequence
# and triggers a warning on modern Python versions.
space_pat = re.compile(r"\s+")


def remove_space(s: str) -> str:
    """Return *s* with every run of whitespace removed."""
    return space_pat.sub("", s)
def process_orc_text(text):
    """Clean up OCR output line by line and return the re-joined text.

    Each line is stripped and blank lines are dropped. Lines that
    ``is_chinese_line`` reports as Chinese (1) have all whitespace removed;
    lines reported as English (-1) are passed through ``correct_sent``
    (error correction, per the original comment); any other line is kept
    as stripped.
    """
    cleaned = []
    for raw in text.split("\n"):
        current = raw.strip()
        # Skip blank lines.
        if not current:
            continue
        verdict = is_chinese_line(current)
        if verdict == 1:
            # Chinese line: drop the spaces.
            current = remove_space(current)
        elif verdict == -1:
            # English line: run error correction.
            current = correct_sent(current)
        cleaned.append(current)
    return "\n".join(cleaned)
def img2text(imgpath, lang='chi_sim'):
    """Run OCR on an image file and return the post-processed text.

    Args:
        imgpath: Image location, passed straight to ``PIL.Image.open``.
        lang: Language code forwarded to ``pytesseract.image_to_string``.

    Returns:
        The recognized text after cleanup by ``process_orc_text``.
    """
    # The context manager closes the image's underlying file once OCR is
    # done, instead of leaving the handle open until garbage collection.
    with Image.open(imgpath) as im:
        text = pytesseract.image_to_string(im, lang=lang)
    return process_orc_text(text)
|