"""Layout analysis helpers for recognised text of scanned exam sheets.

The functions work on a ``words_result`` list (presumably the payload of an
OCR service -- confirm against the producer).  Each entry describes one text
line and is a dict with the keys used throughout this module:

* ``'words'``    -- the text of the line,
* ``'chars'``    -- a list of ``{'char': str, 'location': {...}}`` dicts,
* ``'location'`` -- ``{'left', 'top', 'width', 'height'}`` of the line.

Typical interactive use::

    show_result('some_scan.jpg', debug=2)
"""
import re
import json
import glob
import os

try:
    import cv2
except ImportError:  # OpenCV is only needed by show_result()
    cv2 = None
import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # not used by any function in this module
    plt = None

problem_number_pattern = re.compile(r'\s*(\d+)')
number_pattern = re.compile(r'(\d+)')
sub_problem_number_pattern = re.compile(r'\s*\((\d+)')
max_number = 99  # largest accepted problem number
min_number = 0  # smallest accepted problem number


def get_respond_from_json(json_file):
    """Load and return the JSON document stored in *json_file* (UTF-8)."""
    with open(json_file, 'r', encoding='UTF-8') as f:
        return json.load(f)


def get_number_position(words_result, max_number=max_number, left_position=0, right_position=0):
    """Find the lines that start with a number.

    A line is kept when its leading number is ``<= max_number`` and the
    horizontal centre of the number's first digit lies in
    ``[left_position, right_position]``.  ``right_position == 0`` means
    "no right bound".

    Returns a list of dicts with the keys ``'number'`` (digits as a string),
    ``'center'`` (x centre of the first digit), ``'line'`` (index into
    *words_result*) and ``'location'`` (the location of the whole line).
    """
    numbers = []
    for line_index, line in enumerate(words_result):
        m = problem_number_pattern.match(line['words'])
        if not m:
            continue
        # NOTE: the string offset is used as an index into 'chars', i.e. the
        # code assumes 'chars' is aligned with 'words' -- confirm with the
        # data producer.
        location = line['chars'][m.start(1)]['location']
        number = m.group(1)
        center = location['left'] + location['width'] // 2
        if int(number) > max_number or center < left_position:
            continue
        if right_position == 0 or center <= right_position:
            numbers.append({'number': number, 'center': center,
                            'line': line_index, 'location': line['location']})
    return numbers


def get_number_list(numbers, shift_limit=50):
    """Group numbers into columns of similar horizontal position.

    A number joins a column when its centre is within *shift_limit* of the
    column's most recently added member.  There is intentionally no early
    exit (the original ``break`` was disabled): a number that is close to
    several columns is appended to each of them.  A number that matches no
    column starts a new one.
    """
    number_list = []
    for number in numbers:
        matched = False
        for column in number_list:
            if abs(number['center'] - column[-1]['center']) <= shift_limit:
                column.append(number)
                matched = True
        if not matched:
            number_list.append([number])
    return number_list


def get_longest_sequence(sequence, limit, type='l'):
    """Return ``[start, end)`` of the longest run on one side of *limit*.

    ``type='l'`` selects elements ``<= limit``, ``type='h'`` elements
    ``>= limit``.  Among runs of equal length the first one wins.  When no
    element qualifies ``[0, 0]`` is returned.

    Raises:
        ValueError: if *type* is neither ``'l'`` nor ``'h'``.
    """
    # The parameter name shadows the builtin but is part of the public
    # interface, so it is kept.
    if type == 'l':
        def qualifies(value):
            return value <= limit
    elif type == 'h':
        def qualifies(value):
            return value >= limit
    else:
        raise ValueError("type must be 'l' or 'h', got {!r}".format(type))

    best = [0, 0]
    current = [0, 0]
    for i, value in enumerate(sequence):
        if not qualifies(value):
            continue
        if i == current[1]:
            # Element directly extends the current run.
            current[1] += 1
        else:
            # A gap ended the previous run: remember it if it is the longest.
            if current[1] - current[0] > best[1] - best[0]:
                best = current[:]
            current = [i, i + 1]
    if current[1] - current[0] > best[1] - best[0]:
        best = current[:]
    return best


def get_number_sequence(numbers, max_gap=5, min_number=min_number):
    """Placeholder for a continuity check of a number sequence.

    Not implemented: the arguments are ignored and an empty list is returned.
    """
    number_sequence = []
    return number_sequence


def get_problem_list(number_list):
    """Pick the column of numbers that represents the problem numbers.

    Only one rule is implemented: the column whose first member has the
    smallest x centre wins (first one on ties).  Continuity of the sequence
    is not checked.  Returns ``[]`` for an empty *number_list*.
    """
    if not number_list:
        return []
    index = 0
    left = number_list[0][0]['center']
    for i in range(1, len(number_list)):
        candidate = number_list[i][0]['center']
        if candidate < left:
            index = i
            left = candidate
    return number_list[index]


def get_double_page_number(words_result, img_width, left_ratio=0.4, right_ratio=0.6):
    """Look for vertically aligned digits in the central band of the image.

    Every digit character whose centre lies between ``left_ratio`` and
    ``right_ratio`` of *img_width* is collected and grouped with
    :func:`get_number_list`.  The matching char dicts are modified in place:
    a ``'center'`` key is added.

    Returns:
        ``(found, columns)`` where *found* is True when some column holds at
        least two digits.  (The original returned a bare list in the
        "not found" case; the shape is now the same on both paths.)
    """
    left = int(left_ratio * img_width)
    right = int(right_ratio * img_width)
    numbers = []
    for line in words_result:
        for char in line['chars']:
            center = int(char['location']['left']) + int(char['location']['width']) // 2
            if number_pattern.match(char['char']) and left <= center <= right:
                char.update(center=center)
                numbers.append(char)
    double_page_numbers = get_number_list(numbers)
    found = any(len(column) >= 2 for column in double_page_numbers)
    return found, double_page_numbers


def image_projection(image, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
    """Project image darkness onto the x axis in bins of *gap* pixels.

    The image is inverted (``255 - image``), restricted to the rows between
    ``top_ratio`` and ``bottom_ratio`` of the height, and summed per bin;
    the sum is divided by the number of rows.  Bins start at
    ``range(left, right - gap, gap)``.

    Returns a ``(2, n_bins)`` integer array: row 0 holds the per-bin values,
    row 1 the left x position of every bin.  When the window is too narrow
    for a single bin an empty ``(2, 0)`` array is returned.

    NOTE(review): the arithmetic expects a 2-D image with values in 0..255
    (the only caller passes a grayscale image) -- confirm for other inputs.
    """
    image = 255 - np.asarray(image)
    height = image.shape[0]
    width = image.shape[1]
    top = int(height * top_ratio)
    bottom = int(height * bottom_ratio)
    left = int(width * left_ratio)
    right = int(width * right_ratio)

    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    positions = np.array(range(left, right - gap, gap), dtype=int)
    bins = positions.size
    projection = np.zeros((2, bins), dtype=int)
    projection[1, :] = positions
    if bins == 0 or bottom <= top:
        return projection

    strip = image[top:bottom, left:left + bins * gap]
    # (rows, bins * gap) -> (rows, bins, gap), then add up rows and the
    # columns inside every bin.
    sums = strip.reshape(strip.shape[0], bins, gap).sum(axis=(0, 2), dtype=np.int64)
    projection[0, :] = sums // (bottom - top)
    return projection


def word_projection(words_result, image_shape, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
    """Count characters per x bin of *gap* pixels.

    Only lines whose top edge lies strictly between ``top_ratio`` and
    ``bottom_ratio`` of the image height are considered.  A character is
    counted in the bin that contains its horizontal centre.

    Returns a ``(2, n_bins)`` integer array: row 0 holds the counts, row 1
    the left x position of every bin (``range(left, right - gap, gap)``).
    """
    height = image_shape[0]
    width = image_shape[1]
    left = int(width * left_ratio)
    right = int(width * right_ratio)
    top = int(height * top_ratio)
    bottom = int(height * bottom_ratio)

    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    positions = np.array(range(left, right - gap, gap), dtype=int)
    bins = positions.size
    word_count = np.zeros((2, bins), dtype=int)
    word_count[1, :] = positions

    for line in words_result:
        if not top < line['location']['top'] < bottom:
            continue
        for char in line['chars']:
            center = char['location']['left'] + char['location']['width'] // 2
            # Bins are contiguous, so the index follows directly from the
            # offset; floor division keeps centres left of the window
            # negative and therefore out of range.
            index = int((center - left) // gap)
            if 0 <= index < bins:
                word_count[0, index] += 1
    return word_count


def check_seal_line(words_result, image, type='left', gap=20):
    """Detect a seal line (binding margin) and return its x coordinate.

    Only landscape images (``height / width < 1``) are examined.

    Args:
        words_result: recognised text lines.
        image: 2-D image array.
        type: ``'left'`` or ``'right'`` -- which margin to inspect.  The name
            shadows the builtin but is part of the public interface.
        gap: bin width in pixels used for the projections.

    Returns:
        The x coordinate (plain ``int``) at which the page content starts
        (left) or ends (right), or ``0`` when no seal line is detected, the
        image is not landscape, or *type* is unknown.
    """
    projection_limit = 80  # darkness above which a bin counts as "inked"
    wc_limit = 0  # a bin with at most this many characters counts as empty
    seal_limit = 3  # minimum number of inked bins to assume a seal line
    image = np.asarray(image)
    height, width = image.shape[:2]
    if height / width >= 1:
        return 0

    if type == 'left':
        length_limit = 5  # number of leading bins inspected for ink
        left_ratio = 0
        right_ratio = 0.15
        word_count = word_projection(
            words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        seal_flag = np.sum(image_count[0, :length_limit] > projection_limit)
        if seal_flag < seal_limit:
            # Not enough ink near the left edge: no seal line.
            return 0
        # Never cut to the right of a leading problem number.
        numbers = get_number_position(
            words_result, left_position=length_limit * gap, right_position=right_ratio * width)
        right_flag = min([right_ratio * width] + [number['center'] for number in numbers])
        # Right-most empty bin between the inspected margin and that limit.
        for i in range(word_count.shape[1] - 1, -1, -1):
            if word_count[0, i] <= wc_limit and length_limit * gap <= word_count[1, i] <= right_flag:
                return int(word_count[1, i])
        return length_limit * gap

    if type == 'right':
        left_ratio = 0.85
        right_ratio = 1
        # gap is forwarded so that the bins match the "+ gap" offset below
        # (the original silently used the default bin width here).
        word_count = word_projection(
            words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        # Scan from the right edge; i stops at 1 so that i - 1 never wraps
        # around to the last bin.
        for i in range(word_count.shape[1] - 1, 0, -1):
            if word_count[0, i] <= wc_limit:
                continue
            before_previous = word_count[0, i - 2] if i >= 2 else 0
            if image_count[0, i - 1] <= projection_limit and word_count[0, i - 1] + before_previous > 0:
                return int(word_count[1, i - 1]) + gap
        return 0

    return 0


def check_double_page(words_result, image, height_to_width_ratio=1, wc_limit=2):
    """Detect a two-page spread and return the x coordinate of the gutter.

    For images with ``height / width < height_to_width_ratio`` the central
    band (40 % - 60 % of the width) is searched for the longest run of bins
    holding at most *wc_limit* characters; the position of the middle bin of
    that run is returned as a plain ``int``.  Returns ``0`` when the image
    does not qualify or no such run exists.
    """
    image = np.asarray(image)
    height = image.shape[0]
    width = image.shape[1]
    if height / width >= height_to_width_ratio:
        return 0
    word_count = word_projection(words_result, (height, width), left_ratio=0.4, right_ratio=0.6)
    start, end = get_longest_sequence(word_count[0, :], wc_limit, type='l')
    if end <= start:
        return 0
    return int(word_count[1, (start + end) // 2])


def get_line_from_chars(chars):
    """Return the bounding location of *chars*.

    The result has the keys ``'left'``, ``'top'``, ``'width'`` and
    ``'height'``; an empty dict is returned for an empty *chars*.
    """
    if not chars:
        return {}
    locations = [char['location'] for char in chars]
    xmin = min(loc['left'] for loc in locations)
    ymin = min(loc['top'] for loc in locations)
    xmax = max(loc['left'] + loc['width'] for loc in locations)
    ymax = max(loc['top'] + loc['height'] for loc in locations)
    return {'width': xmax - xmin, 'top': ymin, 'left': xmin, 'height': ymax - ymin}


def get_box_from_lines(lines):
    """Return the bounding box of *lines*.

    The result has the keys ``'xmin'``, ``'ymin'``, ``'xmax'`` and
    ``'ymax'``; an empty dict is returned for an empty *lines*.
    """
    if not lines:
        return {}
    locations = [line['location'] for line in lines]
    return {
        'xmin': min(loc['left'] for loc in locations),
        'ymin': min(loc['top'] for loc in locations),
        'xmax': max(loc['left'] + loc['width'] for loc in locations),
        'ymax': max(loc['top'] + loc['height'] for loc in locations),
    }


def split_line_for_double_pages(line, split_position):
    """Split one text line at *split_position* into its two page parts.

    Characters whose centre is ``<= split_position`` go to the odd (left)
    page, the rest to the even (right) page.

    Returns:
        ``(odd_page_line, even_page_line)``.  A part without characters is
        an empty dict; when all characters fall on one side the original
        *line* object is returned for that side.
    """
    odd_page_chars = []
    even_page_chars = []
    for char in line['chars']:
        center = char['location']['left'] + char['location']['width'] // 2
        if center <= split_position:
            odd_page_chars.append(char)
        else:
            even_page_chars.append(char)

    if not odd_page_chars:
        return {}, line
    if not even_page_chars:
        return line, {}

    # Consume as many non-blank characters (with the blanks in front of
    # them) from the text as there are characters on the odd page.
    words_pattern = re.compile(r'(?:\s*\S){%d}' % len(odd_page_chars))
    match = words_pattern.match(line['words'])
    if match:
        # group(0) also works on interpreters older than Python 3.6, where
        # match objects are not subscriptable.
        odd_page_words = match.group(0)
        even_page_words = line['words'][match.end():]
    else:
        # The text holds fewer visible characters than 'chars': rebuild the
        # two halves from the per-character strings instead of failing.
        odd_page_words = ''.join(str(char.get('char', '')) for char in odd_page_chars)
        even_page_words = ''.join(str(char.get('char', '')) for char in even_page_chars)

    odd_page_line = {'chars': odd_page_chars,
                     'location': get_line_from_chars(odd_page_chars),
                     'words': odd_page_words}
    even_page_line = {}
    if even_page_words:
        even_page_line = {'chars': even_page_chars,
                          'location': get_line_from_chars(even_page_chars),
                          'words': even_page_words}
    return odd_page_line, even_page_line


def get_double_page_text(words_result, split_position):
    """Distribute whole lines over two pages.

    A line whose horizontal centre is ``>= split_position`` belongs to the
    even (right) page, every other line to the odd (left) page.  Lines are
    never cut; see :func:`split_line_for_double_pages` for that.

    Returns ``[odd_page, even_page]``.
    """
    odd_page = []
    even_page = []
    for line in words_result:
        center = line['location']['left'] + line['location']['width'] // 2
        if center >= split_position:
            even_page.append(line)
        else:
            odd_page.append(line)
    return [odd_page, even_page]


def get_page_text(words_result, image):
    """Drop seal-line text and split the remaining lines into pages.

    Lines left of a detected left seal line and right of a detected right
    seal line are discarded.  Returns ``[odd_page, even_page]`` when a
    double page is detected, otherwise ``[words_result]``.
    """
    left_seal_line = check_seal_line(words_result, image, type='left')
    if left_seal_line:
        words_result = get_double_page_text(words_result, left_seal_line)[1]
    right_seal_line = check_seal_line(words_result, image, type='right')
    if right_seal_line:
        words_result = get_double_page_text(words_result, right_seal_line)[0]
    split_position = check_double_page(words_result, image)
    if split_position:
        return get_double_page_text(words_result, split_position)
    return [words_result]


def exam_segment(words_result):
    """Split the sheet into one region per problem number.

    Every entry of the chosen problem-number column (see
    :func:`get_problem_list`) is extended with

    * ``'end_line'`` -- index of the last line of the problem (the line
      before the next problem number, or the last line of the sheet), and
    * ``'box'`` -- ``{'xmin', 'ymin', 'xmax', 'ymax'}``: top of the first
      line, bottom of the last line and the horizontal extent of all lines
      in between.

    Returns the list of extended entries.
    """
    numbers = get_number_position(words_result)
    number_list = get_number_list(numbers)
    group_list = get_problem_list(number_list)

    for current, following in zip(group_list, group_list[1:]):
        current.update(end_line=following['line'] - 1)
    if group_list:
        group_list[-1].update(end_line=len(words_result) - 1)

    for group in group_list:
        last_location = words_result[group['end_line']]['location']
        ymin = group['location']['top']
        ymax = last_location['top'] + last_location['height']
        xmin = group['location']['left']
        xmax = group['location']['left'] + group['location']['width']
        for line_index in range(group['line'], group['end_line'] + 1):
            location = words_result[line_index]['location']
            xmin = min(xmin, location['left'])
            xmax = max(xmax, location['left'] + location['width'])
        group.update(box={'xmin': xmin, 'ymin': ymin, 'xmax': xmax, 'ymax': ymax})
    return group_list


def show_result(img_file, debug=1):
    """Print (and for ``debug=2`` display) analysis results for one scan.

    The recognised text is read from the file named like *img_file* with
    ``'.jpg'`` replaced by ``'_json.txt'``.

    Args:
        img_file: path of the scanned image.
        debug: ``0`` prints all lines and number columns, ``1`` the problem
            and group lists, ``2`` the projections and shows the detected
            split lines in a window (ESC terminates the process), ``3`` the
            text of every page.

    Raises:
        ImportError: if OpenCV is not installed.
        FileNotFoundError: if the image cannot be read.
    """
    if cv2 is None:
        raise ImportError('show_result requires OpenCV (cv2)')
    image_color = cv2.imread(img_file)
    if image_color is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError('cannot read image: {}'.format(img_file))
    image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
    height = image.shape[0]
    width = image.shape[1]
    resp = get_respond_from_json(img_file.replace('.jpg', '_json.txt'))
    words_result = resp['words_result']
    print('**********{}*********'.format(os.path.split(img_file)[1]))
    numbers = get_number_position(words_result)
    number_list = get_number_list(numbers)
    problem_list = get_problem_list(number_list)
    group_list = exam_segment(words_result)

    if debug == 0:
        for line in words_result:
            print('**************************************')
            print(line['words'])
        print('************All Numbers************')
        for num in numbers:
            print(num)
        for column in number_list:
            print('*******Number List********')
            for n in column:
                print(n)
    elif debug == 1:
        print('**********Problem List*********')
        for p in problem_list:
            print(p)
        print('**********Group List**********')
        for g in group_list:
            print(g)
    elif debug == 2:
        gap = 20
        shape = (height, width)
        middle_word_count = word_projection(words_result, shape, left_ratio=0.4, right_ratio=0.6, gap=gap)
        left_word_count = word_projection(words_result, shape, left_ratio=0, right_ratio=0.15, gap=gap)
        right_word_count = word_projection(words_result, shape, left_ratio=0.85, right_ratio=1, gap=gap)
        left_image_projection = image_projection(image, left_ratio=0, right_ratio=0.15, gap=gap)
        middle_image_projection = image_projection(image, left_ratio=0.4, right_ratio=0.6, gap=gap)
        right_image_projection = image_projection(image, left_ratio=0.85, right_ratio=1, gap=gap)
        print('**********Left Projection************')
        print(left_word_count)
        print(left_image_projection)
        print('**********Middle Projection************')
        print(middle_word_count)
        print(middle_image_projection)
        print('**********Right Projection************')
        print(right_word_count)
        print(right_image_projection)
        print('************Split Line****************')
        # Plain ints: OpenCV rejects NumPy scalars inside point tuples in
        # some versions.
        left_p = int(check_seal_line(words_result, image, type='left'))
        right_p = int(check_seal_line(words_result, image, type='right'))
        middle_p = int(check_double_page(words_result, image))
        print(left_p, middle_p, right_p)
        cv2.line(image_color, (left_p, 0), (left_p, height), (0, 0, 255), 5)
        cv2.line(image_color, (middle_p, 0), (middle_p, height), (0, 255, 0), 5)
        cv2.line(image_color, (right_p, 0), (right_p, height), (255, 0, 0), 5)
        cv2.namedWindow('image', cv2.WINDOW_NORMAL)
        cv2.imshow('image', image_color)
        if cv2.waitKey(0) == 27:  # press ESC to exit
            raise SystemExit(0)
        cv2.destroyAllWindows()
    elif debug == 3:
        page_text = get_page_text(words_result, image)
        if len(page_text) == 1:
            print('*************Single Page*********')
            for line in page_text[0]:
                print(line['words'])
        else:
            print('*************Odd Page**********')
            for line in page_text[0]:
                print(line['words'])
            print('************Even Page**********')
            for line in page_text[1]:
                print(line['words'])