group_pictures.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. import re
  2. import shutil
  3. import glob
  4. from pprint import pprint
  5. import segment.ocr.luo_ocr.ocr as luo_ocr
  6. # from pypinyin import lazy_pinyin
  7. from segment.ocr.split_topic_en import topic_type_line
  8. # def to_pinyin_camel(s):
  9. # '''文件123.txt'''
  10. # py_ls = lazy_pinyin(s)
  11. # py_camel = [py.capitalize() for py in py_ls]
  12. # return "".join(py_camel)
  13. #
  14. #
  15. # def rename_filename(filename):
  16. # "将文件名转变为拼音"
  17. # filename_en = to_pinyin_camel(filename)
  18. # try:
  19. # shutil.copy(filename, filename_en)
  20. # except shutil.SameFileError:
  21. # pass
  22. # return filename_en
  23. # def request_ocr(filename):
  24. # '''中文无法上传需要修改成英文'''
  25. # url = "http://117.50.17.141/ocr"
  26. # data = {}
  27. # filename = rename_filename(filename)
  28. # files = {"mydata": open(filename, "rb")}
  29. # r = requests.post(url, data, files=files)
  30. # print(filename)
  31. # print(r.json())
  32. # return r.json()['text']
  33. topic_start = re.compile("^\s*(\d+)\s*[\.、::,,]")
  34. topic_start2 = re.compile("^\s*[(<〈《]?(\d+)\)\s*[\.、::,,]?")
  35. def is_topic_start(s, subject):
  36. """开始节点"""
  37. if subject in ['math', 'english', 'chinese', 'physics', 'chemistry', 'biology']:
  38. if topic_start.match(s):
  39. return True
  40. elif subject == 'math':
  41. if topic_start2.match(s):
  42. return True
  43. return False
  44. else:
  45. raise ValueError("subject={} is not supported!".format(subject))
  46. # -------------------------符合下列条件的则为结束-------------------------
  47. topic_end = re.compile("D\s*[\.、::]")
  48. topic_end2 = re.compile("^\s*G\s*[\.、::]")
  49. def is_topic_end(s, subject):
  50. """结束节点"""
  51. if subject in ['math', 'english', 'chinese', 'physics', 'chemistry', 'biology']:
  52. if topic_end.search(s):
  53. return True
  54. elif subject == 'english':
  55. if topic_end2.match(s):
  56. return True
  57. return False
  58. else:
  59. raise ValueError("subject={} is not supported!".format(subject))
  60. # -------------------------符合下列条件的则为跳过舍去-------------------------
  61. topic_filter = re.compile("^\s*[一二三四五六七八九十]+\s*[\.、::]")
  62. general_filter = ['选择题', '单选题', '多选题',
  63. '填空题', '单空题', '多空题',
  64. '解答题', '简答题', '证明题',
  65. '选做题', '实验题', '第II卷',
  66. '第I卷', '第二卷', ]
  67. english_filter = [["听", "材料"], ["听", "对话"], ["听", "独白"],
  68. ['第一节'], ['第二节'], ['语言知识运用'], ['第II卷'],
  69. ['第二部分'], ['第三部分'], ['第四部分']]
  70. chinese_filter = re.compile("^\s*[((]\s*[一二三四五六七八九十]\s*[))]\s*[\.、::]?")
  71. def contains_all(s, words):
  72. for word in words:
  73. if all([w in s for w in word]):
  74. return True
  75. return False
  76. def is_topic_skip(s, subject):
  77. """判断该行是否可以去掉,跳跃节点"""
  78. if subject == 'english':
  79. return contains_all(s, english_filter)
  80. elif subject in ['math', 'chinese', 'physics', 'chemistry', 'biology']:
  81. if topic_filter.match(s):
  82. return True
  83. if subject == 'chinese':
  84. if chinese_filter.match(s):
  85. return True
  86. for topic_type in general_filter:
  87. if topic_type in s:
  88. return True
  89. return False
  90. else:
  91. raise ValueError("subject={} is not supported!".format(subject))
  92. # ----------------------action----------------------
  93. def group_pictures(pictures, subject=''):
  94. """Assume pictures are row based"""
  95. # texts = []
  96. # for picture in pictures:
  97. # t = luo_ocr.ocr_py(picture)
  98. # t = t.replace("\r", "").replace("\n", "")
  99. # # print(str(t))
  100. # texts.append(str(t))
  101. # # texts = bd_ocr(pictures)
  102. # # print(texts)
  103. texts = [luo_ocr.ocr_py(picture).replace("\r", "").replace("\n", "") for picture in pictures]
  104. groups = []
  105. start = 0
  106. for i, t in enumerate(texts):
  107. if is_topic_start(t, subject):
  108. groups.append([start, i])
  109. start = i
  110. elif is_topic_end(t, subject):
  111. groups.append([start, i + 1])
  112. start = i + 1
  113. elif is_topic_skip(t, subject):
  114. if i > start:
  115. groups.append([start, i])
  116. start = i + 1
  117. len_text = len(texts)
  118. if len_text > start:
  119. groups.append([start, len_text])
  120. return texts, groups