img2text.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. import re
  2. import pytesseract
  3. from PIL import Image
  4. from correct import correct_sent
  # Matches a run of one or more characters in the U+4E00..U+9FA5 range.
  chinese_pat = re.compile(r"[\u4e00-\u9fa5]+")
  # Matches a run of one or more ASCII letters.
  english_pat = re.compile(r"[a-zA-Z]+")


  def is_chinese(s):
      """Return True if *s* begins with a character in U+4E00..U+9FA5.

      The pattern is applied with ``match``, so only the start of *s* is
      tested: ``"中a"`` is accepted while ``"a中"`` and ``""`` are not.
      """
      return chinese_pat.match(s) is not None


  def is_english(s):
      """Return True if *s* begins with an ASCII letter (a-z or A-Z).

      The pattern is applied with ``match``, so only the start of *s* is
      tested: ``"a1"`` is accepted while ``"1a"`` and ``""`` are not.
      """
      return english_pat.match(s) is not None
  def is_chinese_line(line, cc_ce_difference=5):
      """Guess whether *line* is mostly Chinese or mostly English.

      Characters are scanned left to right; each one accepted by
      ``is_chinese`` counts for Chinese and each one accepted by
      ``is_english`` counts for English. As soon as one count leads the
      other by ``cc_ce_difference`` or more, the scan stops and the
      leader decides the result.

      Returns:
          1 if Chinese characters outnumber English ones, -1 otherwise
          (a non-zero tie also yields -1), or 0 if the whole line was
          scanned without finding either kind of character.
      """
      lead = 0  # Chinese count minus English count
      seen_letter = False  # True once any Chinese or English char was counted
      for ch in line:
          if is_chinese(ch):
              lead += 1
              seen_letter = True
          elif is_english(ch):
              lead -= 1
              seen_letter = True
          # Early exit: one script is already far enough ahead
          if abs(lead) >= cc_ce_difference:
              return 1 if lead > 0 else -1
      if not seen_letter:
          return 0
      return 1 if lead > 0 else -1
  # Raw string: "\s" in a normal string literal is an invalid escape sequence
  # and triggers a warning on recent Python versions.
  space_pat = re.compile(r"\s+")


  def remove_space(s):
      """Return *s* with every run of whitespace characters removed."""
      return space_pat.sub("", s)
  def process_orc_text(text):
      """Clean up OCR output one line at a time.

      The text is split on ``"\\n"``. Each line is stripped, and lines that
      are empty after stripping are dropped. Every remaining line is
      classified with ``is_chinese_line``:

      * result 1 (Chinese): all whitespace is removed via ``remove_space``;
      * result -1 (English): the line is passed through ``correct_sent``
        to fix errors;
      * result 0 (undecided): the stripped line is kept unchanged.

      Returns:
          The processed lines joined back together with ``"\\n"``.
      """
      cleaned = []
      for raw in text.split("\n"):
          stripped = raw.strip()
          if stripped:  # skip blank lines
              verdict = is_chinese_line(stripped)
              if verdict == 1:
                  # Chinese line: drop the spaces
                  stripped = remove_space(stripped)
              elif verdict == -1:
                  # English line: run error correction
                  stripped = correct_sent(stripped)
              cleaned.append(stripped)
      return "\n".join(cleaned)
  def img2text(imgpath, lang='chi_sim'):
      """Run OCR on an image file and return the post-processed text.

      Args:
          imgpath: Path of the image; passed to ``Image.open``.
          lang: Language code passed to ``pytesseract.image_to_string``
              (defaults to ``'chi_sim'``).

      Returns:
          The recognized text after clean-up by ``process_orc_text``.
      """
      # Use a context manager so the image's file handle is released even if
      # the OCR call raises; the original never closed the image.
      with Image.open(imgpath) as im:
          text = pytesseract.image_to_string(im, lang=lang)
      return process_orc_text(text)