exam_segment.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. import re
  2. import json
  3. import glob
  4. import os
  5. import cv2
  6. import numpy as np
  7. import matplotlib.pyplot as plt
  8. problem_number_pattern = re.compile(r'\s*(\d+)')
  9. number_pattern = re.compile(r'(\d+)')
  10. sub_problem_number_pattern = re.compile(r'\s*\((\d+)')
  11. max_number = 99 # 最大题目数
  12. min_number = 0 # 最小题目数
  13. def get_respond_from_json(json_file):
  14. with open(json_file, 'r', encoding='UTF-8') as f:
  15. resp = json.load(f)
  16. return resp
  17. def get_number_position(words_result, max_number=max_number, left_position=0, right_position=0):
  18. # 获取以数字开头的位置, 保留num<=max_number以及字符位置位于[left_position,right_position]的数
  19. numbers = []
  20. for line_index in range(len(words_result)):
  21. line = words_result[line_index]
  22. #print('**************************************')
  23. #print(line['words'])
  24. #print(line['chars'][:2])
  25. m = problem_number_pattern.match(line['words'])
  26. if m:
  27. location = line['chars'][m.start(1)]['location']
  28. number = line['words'][m.start(1):m.end(1)]
  29. center = location['left'] + location['width'] // 2
  30. if int(number) <= max_number and center >= left_position:
  31. if right_position == 0:
  32. numbers.append(
  33. {'number': number, 'center': center, 'line': line_index, 'location': line['location']})
  34. elif center <= right_position:
  35. numbers.append(
  36. {'number': number, 'center': center, 'line': line_index, 'location': line['location']})
  37. #print(number, center, location)
  38. #print(line['chars'][m.start(1)])
  39. return numbers
  40. def get_number_list(numbers, shift_limit=50):
  41. # 获取横坐标相近的数字序列
  42. number_list = []
  43. for number in numbers:
  44. not_found_flag = 1
  45. for single_list in number_list:
  46. if abs(number['center']-single_list[-1]['center']) <= shift_limit:
  47. single_list.append(number)
  48. not_found_flag = 0
  49. #break
  50. if not_found_flag:
  51. single_list = []
  52. single_list.append(number)
  53. number_list.append(single_list)
  54. return number_list
  55. def get_longest_sequence(sequence, limit, type='l'):
  56. # 获取limit之下或之上的最长连续序列
  57. flag = [[0, 0], [0, 0]]
  58. for i in range(len(sequence)):
  59. if type == 'l':
  60. f = sequence[i] <= limit
  61. elif type == 'h':
  62. f = sequence[i] >= limit
  63. if f:
  64. if i == flag[1][1]:
  65. flag[1][1] += 1
  66. else:
  67. if flag[1][1] - flag[1][0] > flag[0][1] - flag[0][0]:
  68. flag[0][:] = flag[1][:]
  69. flag[1][:] = [i, i + 1]
  70. if flag[1][1] - flag[1][0] > flag[0][1] - flag[0][0]:
  71. flag[0][:] = flag[1][:]
  72. return flag[0][:]
  73. def get_number_sequence(numbers, max_gap=5, min_number=min_number):
  74. # 数列连续性判断
  75. number_sequence = []
  76. return number_sequence
  77. def get_problem_list(number_list):
  78. # 选取题号序列
  79. # rule1: 横坐标最小
  80. # rule2: 序列连续性?
  81. # rule3: 整体题号连续性?
  82. if number_list:
  83. index = 0
  84. left = number_list[index][0]['center']
  85. else:
  86. return []
  87. for i in range(1, len(number_list)):
  88. if number_list[i][0]['center'] < left:
  89. index = i
  90. left = number_list[i][0]['center']
  91. return number_list[index]
  92. def get_double_page_number(words_result, img_width, left_ratio=0.4, right_ratio=0.6):
  93. left = int(left_ratio * img_width)
  94. right = int(right_ratio * img_width)
  95. numbers = []
  96. for line in words_result:
  97. for char in line['chars']:
  98. center = int(char['location']['left']) + int(char['location']['width']) // 2
  99. if number_pattern.match(char['char']) and left <= center <= right:
  100. char.update(center=center)
  101. numbers.append(char)
  102. double_page_numbers = get_number_list(numbers)
  103. for d in double_page_numbers:
  104. if len(d) >= 2:
  105. return True, double_page_numbers
  106. return double_page_numbers
  107. def image_projection(image, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
  108. # 图像投影projection = [counts, positions]
  109. image = np.asarray(image)
  110. image = 255 - image
  111. height = image.shape[0]
  112. width = image.shape[1]
  113. top = int(height * top_ratio)
  114. bottom = int(height * bottom_ratio)
  115. left = int(width * left_ratio)
  116. right = int(width * right_ratio)
  117. # col_num = (right - left + 1) // gap
  118. # right = left + col_num * gap
  119. projection = np.zeros((2, len(range(left, right-gap, gap))), dtype=np.int)
  120. projection[1, :] = np.asarray(range(left, right-gap, gap), dtype=np.int)
  121. projection[0, :] = np.sum(np.sum(np.hsplit(
  122. image[top:bottom, left:projection[1, -1]+gap], projection.shape[1]), axis=1), axis=1) // (bottom - top)
  123. return projection
  124. def word_projection(words_result, image_shape, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
  125. # 字符投影word_count = [counts, positions]
  126. height = image_shape[0]
  127. width = image_shape[1]
  128. left = int(width * left_ratio)
  129. right = int(width * right_ratio)
  130. top = int(height * top_ratio)
  131. bottom = int(height * bottom_ratio)
  132. word_count = np.zeros((2, len(range(left, right-gap, gap))), dtype=np.int)
  133. word_count[1, :] = np.asarray(range(left, right-gap, gap), dtype=np.int)
  134. for line in words_result:
  135. if top < line['location']['top'] < bottom:
  136. for char in line['chars']:
  137. center = char['location']['left'] + char['location']['width'] // 2
  138. for i in range(word_count.shape[1]):
  139. if 0 <= center - word_count[1, i] < gap:
  140. word_count[0, i] += 1
  141. return word_count
  142. def check_seal_line(words_result, image, type='left', gap=20):
  143. # 检查是否有密封线,返回密封线横坐标
  144. projection_limit = 80
  145. wc_limit = 0
  146. seal_limit = 3
  147. image = np.asarray(image)
  148. height, width = image.shape[:2]
  149. if height / width < 1:
  150. if type == 'left':
  151. # 检查左密封线
  152. length_limit = 5
  153. left_ratio = 0
  154. right_ratio = 0.15
  155. word_count = word_projection(
  156. words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
  157. image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
  158. seal_flag = np.sum(image_count[0, :length_limit] > projection_limit)
  159. if seal_flag < seal_limit:
  160. # 判定无密封线
  161. return 0
  162. else:
  163. # 获取数字开头的位置
  164. numbers = get_number_position(
  165. words_result, left_position=length_limit*gap, right_position=right_ratio*width)
  166. right_flag = right_ratio * width
  167. for number in numbers:
  168. right_flag = min(right_flag, number['center'])
  169. for i in range(word_count.shape[1]-1, -1, -1):
  170. if word_count[0, i] <= wc_limit:
  171. if length_limit*gap <= word_count[1, i] <= right_flag:
  172. return word_count[1, i]
  173. return length_limit * gap
  174. elif type == 'right':
  175. # 检查右密封线
  176. left_ratio = 0.85
  177. right_ratio = 1
  178. word_count = word_projection(words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio)
  179. image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio)
  180. # seal_flag = np.sum(image_count[0, -length_limit:] > projection_limit)
  181. # if seal_flag < seal_limit:
  182. # return 0
  183. # else:
  184. # for i in range(word_count.shape[1]-length_limit, -1, -1):
  185. # if word_count[0, i] > wc_limit and image_count[0, i] <= projection_limit:
  186. # return word_count[1, i] + 2 * gap
  187. # return width - length_limit * gap
  188. for i in range(word_count.shape[1]-1, -1, -1):
  189. if word_count[0, i] > wc_limit:
  190. if image_count[0, i-1] <= projection_limit and word_count[0, i-1] + word_count[0, i-2] > 0:
  191. return word_count[1, i-1] + gap
  192. return 0
  193. else:
  194. return 0
  195. def check_double_page(words_result, image, height_to_width_ratio=1, wc_limit=2):
  196. # 检查是否有分页, 返回分割线横坐标
  197. image = np.asarray(image)
  198. height = image.shape[0]
  199. width = image.shape[1]
  200. flag = [[0, 0], [0, 0]]
  201. if height / width < height_to_width_ratio:
  202. word_count = word_projection(words_result, (height, width), left_ratio=0.4, right_ratio=0.6)
  203. for i in range(word_count.shape[1]):
  204. if word_count[0, i] <= wc_limit:
  205. if i == flag[1][1]:
  206. flag[1][1] += 1
  207. else:
  208. if flag[1][1] - flag[1][0] > flag[0][1] - flag[0][0]:
  209. flag[0][:] = flag[1][:]
  210. flag[1][:] = [i, i+1]
  211. if flag[1][1] - flag[1][0] > flag[0][1] - flag[0][0]:
  212. return word_count[1, (flag[1][0]+flag[1][1])//2]
  213. elif flag[0][1]:
  214. return word_count[1, (flag[0][0] + flag[0][1]) // 2]
  215. else:
  216. return 0
  217. return 0
  218. # for i in range(word_count.shape[1]//2):
  219. # kplus = word_count.shape[1]//2 + i
  220. # kminus = word_count.shape[1]//2 - i
  221. # if word_count[0, kplus] <= wc_limit:
  222. # return word_count[1, kplus]
  223. # elif word_count[0, kminus] <= wc_limit:
  224. # return word_count[1, kminus]
  225. # return 0
  226. def get_line_from_chars(chars):
  227. # 从一行所有字符获取行的整体坐标
  228. if chars:
  229. xmin = chars[0]['location']['left']
  230. ymin = chars[0]['location']['top']
  231. xmax = chars[0]['location']['left'] + chars[0]['location']['width']
  232. ymax = chars[0]['location']['top'] + chars[0]['location']['height']
  233. for char in chars:
  234. if xmin > char['location']['left']:
  235. xmin = char['location']['left']
  236. if ymin > char['location']['top']:
  237. ymin = char['location']['top']
  238. if xmax < char['location']['left'] + char['location']['width']:
  239. xmax = char['location']['left'] + char['location']['width']
  240. if ymax < char['location']['top'] + char['location']['height']:
  241. ymax = char['location']['top'] + char['location']['height']
  242. result = {'width': xmax-xmin, 'top': ymin, 'left': xmin, 'height': ymax-ymin}
  243. return result
  244. else:
  245. return {}
  246. def get_box_from_lines(lines):
  247. # 获取包含所有行区域的整体坐标
  248. if lines:
  249. ymin = lines[0]['location']['top']
  250. ymax = lines[0]['location']['top'] + lines[0]['location']['height']
  251. xmin = lines[0]['location']['left']
  252. xmax = lines[0]['location']['left'] + lines[0]['location']['width']
  253. for line in lines:
  254. if xmin > line['location']['left']:
  255. xmin = line['location']['left']
  256. if ymin > line['location']['top']:
  257. ymin = line['location']['top']
  258. if xmax < line['location']['left'] + line['location']['width']:
  259. xmax = line['location']['left'] + line['location']['width']
  260. if ymax < line['location']['top'] + line['location']['height']:
  261. ymax = line['location']['top'] + line['location']['height']
  262. return {'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax}
  263. else:
  264. return {}
  265. def split_line_for_double_pages(line, split_position):
  266. # 把单行按双页分割
  267. char_pattern = r'\s*\S'
  268. words_pattern = r''
  269. odd_page_line = {}
  270. even_page_line = {}
  271. odd_page_chars = []
  272. even_page_chars = []
  273. for char in line['chars']:
  274. center = char['location']['left'] + char['location']['width'] // 2
  275. if center <= split_position:
  276. odd_page_chars.append(char)
  277. else:
  278. even_page_chars.append(char)
  279. words_length = len(odd_page_chars)
  280. if words_length == 0:
  281. even_page_line = line
  282. elif len(even_page_chars) == 0:
  283. odd_page_line = line
  284. else:
  285. odd_page_location = get_line_from_chars(odd_page_chars)
  286. even_page_location = get_line_from_chars(even_page_chars)
  287. for i in range(words_length):
  288. words_pattern += char_pattern
  289. words_pattern = re.compile(words_pattern) # ubuntu上有问题
  290. match = words_pattern.match(line['words'])
  291. odd_page_words = match[0]
  292. even_page_words = line['words'][match.end():]
  293. odd_page_line = {'chars': odd_page_chars, 'location': odd_page_location, 'words': odd_page_words}
  294. if even_page_words:
  295. even_page_line = {'chars': even_page_chars, 'location': even_page_location, 'words': even_page_words}
  296. return odd_page_line, even_page_line
  297. def get_double_page_text(words_result, split_position):
  298. # 把文本按双页分割
  299. odd_page = []
  300. even_page = []
  301. for line in words_result:
  302. if line['location']['left'] + line['location']['width'] // 2 >= split_position:
  303. even_page.append(line)
  304. else:
  305. odd_page.append(line)
  306. # else:
  307. # odd_page_line, even_page_line = split_line_for_double_pages(line, split_position)
  308. # if odd_page_line:
  309. # odd_page.append(odd_page_line)
  310. # if even_page_line:
  311. # even_page.append(even_page_line)
  312. return [odd_page, even_page]
  313. # def get_double_page_text(words_result, split_position):
  314. # odd_page = []
  315. # even_page = []
  316. # for line in words_result:
  317. # odd_page_chars = []
  318. # even_page_chars = []
  319. # for char in line['chars']:
  320. # center = char['location']['left'] + char['location']['width'] // 2
  321. # if center <= split_position:
  322. # odd_page_chars.append(char)
  323. # else:
  324. # even_page_chars.append(char)
  325. # line_result = get_line_from_chars(odd_page_chars)
  326. # if line_result:
  327. # odd_page.append(line_result)
  328. # line_result = get_line_from_chars(even_page_chars)
  329. # if line_result:
  330. # even_page.append(line_result)
  331. # return [odd_page, even_page]
  332. def get_page_text(words_result, image):
  333. # 除去密封线,分页,获取页面文本结果
  334. left_seal_line = check_seal_line(words_result, image, type='left')
  335. if left_seal_line:
  336. words_result = get_double_page_text(words_result, left_seal_line)[1]
  337. right_seal_line = check_seal_line(words_result, image, type='right')
  338. if right_seal_line:
  339. words_result = get_double_page_text(words_result, right_seal_line)[0]
  340. split_position = check_double_page(words_result, image)
  341. if split_position:
  342. return get_double_page_text(words_result, split_position)
  343. else:
  344. return [words_result]
  345. def exam_segment(words_result):
  346. # 分割试卷区域
  347. numbers = get_number_position(words_result)
  348. number_list = get_number_list(numbers)
  349. group_list = get_problem_list(number_list)
  350. for i in range(len(group_list)-1):
  351. group_list[i].update(end_line=group_list[i+1]['line']-1)
  352. if len(group_list) >= 1:
  353. group_list[-1].update(end_line=len(words_result)-1)
  354. for g in group_list:
  355. ymin = g['location']['top']
  356. ymax = words_result[g['end_line']]['location']['top'] + words_result[g['end_line']]['location']['height']
  357. xmin = g['location']['left']
  358. xmax = g['location']['left'] + g['location']['width']
  359. for line in range(g['line'], g['end_line']+1):
  360. left = words_result[line]['location']['left']
  361. width = words_result[line]['location']['width']
  362. if xmin > left:
  363. xmin = left
  364. if xmax < left + width:
  365. xmax = left + width
  366. g.update(box={'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax})
  367. return group_list
  368. def show_result(img_file, debug=1):
  369. image_color = cv2.imread(img_file)
  370. image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
  371. height = image.shape[0]
  372. width = image.shape[1]
  373. resp = get_respond_from_json(img_file.replace('.jpg', '_json.txt'))
  374. words_result = resp['words_result']
  375. print('**********{}*********'.format(os.path.split(img_file)[1]))
  376. numbers = get_number_position(words_result)
  377. number_list = get_number_list(numbers)
  378. problem_list = get_problem_list(number_list)
  379. group_list = exam_segment(words_result)
  380. #double_page_numbers = get_double_page_number(words_result, img.shape[1])
  381. if debug == 0:
  382. for line_index in range(len(words_result)):
  383. line = words_result[line_index]
  384. print('**************************************')
  385. print(line['words'])
  386. print('************All Numbers************')
  387. for num in numbers:
  388. print(num)
  389. for numbers in number_list:
  390. print('*******Number List********')
  391. for n in numbers:
  392. print(n)
  393. elif debug == 1:
  394. print('**********Problem List*********')
  395. for p in problem_list:
  396. print(p)
  397. print('**********Group List**********')
  398. for g in group_list:
  399. print(g)
  400. elif debug == 2:
  401. gap = 20
  402. middle_word_count = word_projection(words_result, (height, width), left_ratio=0.4, right_ratio=0.6, gap=gap)
  403. left_word_count = word_projection(words_result, (height, width), left_ratio=0, right_ratio=0.15, gap=gap)
  404. right_word_count = word_projection(words_result, (height, width), left_ratio=0.85, right_ratio=1, gap=gap)
  405. left_image_projection = image_projection(image, left_ratio=0, right_ratio=0.15, gap=gap)
  406. middle_image_projection = image_projection(image, left_ratio=0.4, right_ratio=0.6, gap=gap)
  407. right_image_projection = image_projection(image, left_ratio=0.85, right_ratio=1, gap=gap)
  408. print('**********Left Projection************')
  409. print(left_word_count)
  410. print(left_image_projection)
  411. #print(get_longest_sequence(left_word_count[0, :], 2))
  412. #print(get_longest_sequence(left_image_projection[0, :], 100, type='h'))
  413. print('**********Middle Projection************')
  414. print(middle_word_count)
  415. print(middle_image_projection)
  416. print('**********Right Projection************')
  417. print(right_word_count)
  418. print(right_image_projection)
  419. print('************Split Line****************')
  420. left_p = check_seal_line(words_result, image, type='left')
  421. right_p = check_seal_line(words_result, image, type='right')
  422. middle_p = check_double_page(words_result, image)
  423. print(left_p, middle_p, right_p)
  424. cv2.line(image_color, (left_p, 0), (left_p, height), (0, 0, 255), 5)
  425. cv2.line(image_color, (middle_p, 0), (middle_p, height), (0, 255, 0), 5)
  426. cv2.line(image_color, (right_p, 0), (right_p, height), (255, 0, 0), 5)
  427. cv2.namedWindow('image', cv2.WINDOW_NORMAL)
  428. cv2.imshow('image', image_color)
  429. if cv2.waitKey(0) == 27: # press ESC to exit
  430. exit(0)
  431. cv2.destroyAllWindows()
  432. elif debug == 3:
  433. page_text = get_page_text(words_result, image)
  434. if len(page_text) == 1:
  435. print('*************Single Page*********')
  436. for line in page_text[0]:
  437. print(line['words'])
  438. else:
  439. print('*************Odd Page**********')
  440. for line in page_text[0]:
  441. print(line['words'])
  442. print('************Even Page**********')
  443. for line in page_text[1]:
  444. print(line['words'])
  445. # elif style == 4:
  446. # print('***********Page Text***********')
  447. # page_result = get_page_text(words_result, image)
  448. # if len(page_result) == 1:
  449. # print('***********Single Page***********')
  450. # for line in page_result[0]:
  451. # print(line['words'])
  452. # elif len(page_result) == 2:
  453. # print('*********Odd************')
  454. # for line in page_result[0]:
  455. # print(line['words'])
  456. # print('********Even************')
  457. # for line in page_result[1]:
  458. # print(line['words'])
  459. # if __name__ == "__main__":
  460. # img_file = r'E:\data\test-problems\10.jpg'
  461. # # show_result(img_file, debug=2)
  462. # image_color = cv2.imread(img_file)
  463. # image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
  464. # height = image.shape[0]
  465. # width = image.shape[1]
  466. # resp = get_respond_from_json(img_file.replace('.jpg', '_json.txt'))
  467. # words_result = resp['words_result']
  468. # print('**********{}*********'.format(os.path.split(img_file)[1]))
  469. # text_list = get_page_text(words_result, image)
  470. #
  471. # # work_dir = r'E:\data\seal_line'
  472. # # for img_file in glob.glob(os.path.join(work_dir, '*.jpg')):
  473. # # show_result(img_file, style=2)