123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538 |
- import re
- import json
- import glob
- import os
- import cv2
- import numpy as np
- import matplotlib.pyplot as plt
# Integer at the very start of a text line, optionally preceded by whitespace.
problem_number_pattern = re.compile(r'\s*(\d+)')

# A run of digits (anchored at the start when used with ``.match``).
number_pattern = re.compile(r'(\d+)')

# Integer following an opening parenthesis, e.g. the "3" in " (3".
sub_problem_number_pattern = re.compile(r'\s*\((\d+)')

max_number = 99  # largest problem number accepted
min_number = 0  # smallest problem number accepted
def get_respond_from_json(json_file):
    """Read ``json_file`` as UTF-8 and return the decoded JSON value.

    Args:
        json_file: path of the JSON file to load.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    with open(json_file, mode='r', encoding='UTF-8') as handle:
        return json.load(handle)
def get_number_position(words_result, max_number=max_number, left_position=0, right_position=0):
    """Find the text lines that begin with a number and report where it sits.

    A line is kept when its leading integer is ``<= max_number`` and the
    horizontal centre of the number's first character lies at or right of
    ``left_position`` and, unless ``right_position`` is 0 (meaning "no right
    bound"), at or left of ``right_position``.

    Args:
        words_result: list of line dicts with ``'words'`` (text), ``'chars'``
            (per-character dicts holding a ``'location'`` box) and
            ``'location'`` (the line box).
        max_number: largest number accepted.
        left_position: minimum accepted centre x-coordinate.
        right_position: maximum accepted centre x-coordinate, 0 to disable.

    Returns:
        A list of dicts with keys ``'number'`` (digit string), ``'center'``,
        ``'line'`` (index into ``words_result``) and ``'location'`` (line box).
    """
    found = []
    for index, entry in enumerate(words_result):
        matched = problem_number_pattern.match(entry['words'])
        if not matched:
            continue
        begin, end = matched.start(1), matched.end(1)
        # NOTE(review): indexes 'chars' with a string offset - assumes the
        # character list lines up one-to-one with 'words'; confirm.
        box = entry['chars'][begin]['location']
        digits = entry['words'][begin:end]
        middle = box['left'] + box['width'] // 2
        if int(digits) > max_number or middle < left_position:
            continue
        if right_position == 0 or middle <= right_position:
            found.append(
                {'number': digits, 'center': middle, 'line': index, 'location': entry['location']})
    return found
def get_number_list(numbers, shift_limit=50):
    """Group items whose ``'center'`` x-coordinates are close to each other.

    Each item is compared with the *last* member of every existing group and
    is appended to every group within ``shift_limit`` (so an item may end up
    in several groups). If no group accepts it, it starts a new group.

    Args:
        numbers: iterable of dicts carrying a ``'center'`` value.
        shift_limit: maximum absolute centre difference to join a group.

    Returns:
        A list of groups, each a list of the original item objects.
    """
    groups = []
    for item in numbers:
        accepting = [
            group for group in groups
            if abs(item['center'] - group[-1]['center']) <= shift_limit
        ]
        if accepting:
            for group in accepting:
                group.append(item)
        else:
            groups.append([item])
    return groups
def get_longest_sequence(sequence, limit, type='l'):
    """Return the longest run of consecutive items on one side of ``limit``.

    Args:
        sequence: iterable of values comparable with ``limit``.
        limit: threshold value.
        type: ``'l'`` selects items ``<= limit``; ``'h'`` selects items
            ``>= limit``.

    Returns:
        ``[start, stop]`` - the half-open index range of the longest run.
        The earliest run wins ties; ``[0, 0]`` is returned when no item
        qualifies.

    Raises:
        ValueError: if ``type`` is neither ``'l'`` nor ``'h'`` (previously
            this surfaced as an unbound local variable inside the loop).
    """
    if type not in ('l', 'h'):
        raise ValueError("type must be 'l' or 'h', got {!r}".format(type))
    select_low = type == 'l'

    best = [0, 0]  # longest completed run so far
    run = [0, 0]  # run currently being extended
    for index, value in enumerate(sequence):
        qualifies = value <= limit if select_low else value >= limit
        if not qualifies:
            continue
        if index == run[1]:
            # Directly follows the current run: extend it.
            run[1] += 1
        else:
            # A gap was crossed: bank the finished run if it is the longest.
            if run[1] - run[0] > best[1] - best[0]:
                best = run
            run = [index, index + 1]
    if run[1] - run[0] > best[1] - best[0]:
        best = run
    return list(best)
def get_number_sequence(numbers, max_gap=5, min_number=min_number):
    """Placeholder for a number-continuity check.

    Not implemented: the arguments are ignored and a new empty list is
    returned on every call.
    """
    return []
def get_problem_list(number_list):
    """Pick the group assumed to hold the problem numbers.

    The only rule applied is "left-most": the group whose first item has the
    smallest ``'center'`` wins, and the earliest group wins ties. (Continuity
    rules are listed as open ideas in the original notes but not implemented.)

    Args:
        number_list: list of non-empty groups as built by ``get_number_list``.

    Returns:
        The selected group, or ``[]`` when ``number_list`` is empty.
    """
    if not number_list:
        return []
    return min(number_list, key=lambda group: group[0]['center'])
def get_double_page_number(words_result, img_width, left_ratio=0.4, right_ratio=0.6):
    """Look for vertically aligned digits in the central band of the image.

    Every character that starts with a digit and whose horizontal centre lies
    within ``[left_ratio * img_width, right_ratio * img_width]`` is collected
    and grouped by x-position with ``get_number_list``.

    Side effect: each collected character dict gets a ``'center'`` key.

    Args:
        words_result: list of line dicts with a ``'chars'`` list; each
            character has ``'char'`` and a ``'location'`` box.
        img_width: image width in pixels.
        left_ratio: left edge of the band as a fraction of the width.
        right_ratio: right edge of the band as a fraction of the width.

    Returns:
        ``(found, groups)`` where ``found`` is True when at least one group
        holds two or more digits. The original returned a bare list in the
        negative case, so the result could not be unpacked uniformly; it is
        now always a 2-tuple.
    """
    left = int(left_ratio * img_width)
    right = int(right_ratio * img_width)

    digits = []
    for line in words_result:
        for char in line['chars']:
            box = char['location']
            center = int(box['left']) + int(box['width']) // 2
            if number_pattern.match(char['char']) and left <= center <= right:
                char.update(center=center)
                digits.append(char)

    double_page_numbers = get_number_list(digits)
    found = any(len(group) >= 2 for group in double_page_numbers)
    return found, double_page_numbers
def image_projection(image, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
    """Project inverted pixel intensity onto vertical strips of width ``gap``.

    The image is inverted (``255 - image``) and cropped to the rows
    ``[top_ratio, bottom_ratio)`` of the height. Strips start at
    ``left_ratio * width`` and advance by ``gap`` while the start stays below
    ``right_ratio * width - gap``.

    Args:
        image: 2-D array-like of pixel values (the 255-inversion suggests
            8-bit grayscale - TODO confirm with callers).
        left_ratio: left edge of the scanned band as a fraction of the width.
        right_ratio: right edge of the scanned band as a fraction of the width.
        top_ratio: first scanned row as a fraction of the height.
        bottom_ratio: row after the last scanned one, as a fraction of the
            height.
        gap: strip width in pixels.

    Returns:
        Integer array of shape ``(2, n)``: row 0 is the sum of the inverted
        pixels of each strip floor-divided by the number of scanned rows,
        row 1 is the x-coordinate where each strip starts. ``n`` is 0 when
        the band is too narrow for a single strip.
    """
    inverted = 255 - np.asarray(image)
    height, width = inverted.shape[0], inverted.shape[1]
    top = int(height * top_ratio)
    bottom = int(height * bottom_ratio)
    left = int(width * left_ratio)
    right = int(width * right_ratio)

    starts = range(left, right - gap, gap)
    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    projection = np.zeros((2, len(starts)), dtype=int)
    projection[1, :] = np.asarray(starts, dtype=int)
    if len(starts) == 0:
        # Nothing to project; the original crashed on projection[1, -1] here.
        return projection

    band = inverted[top:bottom, left:projection[1, -1] + gap]
    strips = np.hsplit(band, projection.shape[1])
    projection[0, :] = np.sum(np.sum(strips, axis=1), axis=1) // (bottom - top)
    return projection
def word_projection(words_result, image_shape, left_ratio, right_ratio, top_ratio=0.2, bottom_ratio=0.9, gap=20):
    """Count characters per vertical strip of width ``gap``.

    Only lines whose ``'top'`` lies strictly between ``top_ratio * height``
    and ``bottom_ratio * height`` are considered. A character is counted in
    the strip that contains its horizontal centre.

    Args:
        words_result: list of line dicts with ``'location'`` and ``'chars'``;
            each character has a ``'location'`` with ``'left'``/``'width'``.
        image_shape: ``(height, width, ...)`` of the image.
        left_ratio: left edge of the scanned band as a fraction of the width.
        right_ratio: right edge of the scanned band as a fraction of the width.
        top_ratio: upper bound for line tops as a fraction of the height.
        bottom_ratio: lower bound for line tops as a fraction of the height.
        gap: strip width in pixels.

    Returns:
        Integer array of shape ``(2, n)``: row 0 holds the character count of
        each strip, row 1 the x-coordinate where each strip starts.
    """
    height, width = image_shape[0], image_shape[1]
    left = int(width * left_ratio)
    right = int(width * right_ratio)
    top = int(height * top_ratio)
    bottom = int(height * bottom_ratio)

    starts = range(left, right - gap, gap)
    columns = len(starts)
    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    word_count = np.zeros((2, columns), dtype=int)
    word_count[1, :] = np.asarray(starts, dtype=int)

    for line in words_result:
        if not top < line['location']['top'] < bottom:
            continue
        for char in line['chars']:
            center = char['location']['left'] + char['location']['width'] // 2
            offset = center - left
            if offset < 0:
                continue
            # Strips start at left + k * gap, so the strip index follows
            # directly from the offset; no scan over all strips is needed.
            column = int(offset // gap)
            if 0 <= column < columns:
                word_count[0, column] += 1
    return word_count
def check_seal_line(words_result, image, type='left', gap=20):
    """Detect a seal line near the left or right edge of a landscape page.

    Only images that are wider than tall are examined; for any other image,
    or for an unrecognised ``type``, 0 is returned.

    Left side: the first 5 strips of the left 15% of the image must contain
    at least 3 strips whose ink projection exceeds 80; otherwise there is no
    seal line. The split is then the right-most strip without characters
    that lies between ``5 * gap`` and the left-most numbered line found in
    that band, falling back to ``5 * gap``.

    Right side: scanning the right 15% from right to left, the split is
    placed at the first strip that contains characters, whose left neighbour
    has an ink projection of at most 80, and where the one or two strips to
    its left contain characters.

    Changes relative to the original right-side scan: the ``gap`` argument is
    now also used for the projections (they always used the default 20), and
    neighbour indices no longer wrap around to the right-most strips when the
    scan reaches the first two strips.

    Args:
        words_result: list of OCR-style line dicts (``'words'``, ``'chars'``,
            ``'location'``).
        image: array-like image; only its first two dimensions are used here.
        type: ``'left'`` or ``'right'``.
        gap: strip width in pixels.

    Returns:
        The x-coordinate of the split as ``int``, or 0 when no seal line is
        detected.
    """
    projection_limit = 80  # ink projection above this marks a "dark" strip
    wc_limit = 0  # a strip with at most this many characters counts as empty
    seal_limit = 3  # dark strips required to accept a left seal area

    image = np.asarray(image)
    height, width = image.shape[:2]
    if not height / width < 1:
        return 0

    if type == 'left':
        length_limit = 5
        left_ratio = 0
        right_ratio = 0.15
        word_count = word_projection(
            words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        dark_strips = np.sum(image_count[0, :length_limit] > projection_limit)
        if dark_strips < seal_limit:
            # Not enough ink near the edge: no seal line.
            return 0

        margin = length_limit * gap
        # The split must not go past the left-most numbered line in the band.
        numbers = get_number_position(
            words_result, left_position=margin, right_position=right_ratio * width)
        right_flag = right_ratio * width
        for number in numbers:
            right_flag = min(right_flag, number['center'])

        for i in range(word_count.shape[1] - 1, -1, -1):
            if word_count[0, i] <= wc_limit and margin <= word_count[1, i] <= right_flag:
                return int(word_count[1, i])
        return margin

    if type == 'right':
        left_ratio = 0.85
        right_ratio = 1
        word_count = word_projection(
            words_result, (height, width), left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        image_count = image_projection(image, left_ratio=left_ratio, right_ratio=right_ratio, gap=gap)
        # Stop at i == 1: strip 0 has no left neighbour to inspect.
        for i in range(word_count.shape[1] - 1, 0, -1):
            if word_count[0, i] <= wc_limit:
                continue
            words_to_left = word_count[0, i - 1]
            if i >= 2:
                words_to_left = words_to_left + word_count[0, i - 2]
            if image_count[0, i - 1] <= projection_limit and words_to_left > 0:
                return int(word_count[1, i - 1] + gap)
        return 0

    return 0
def check_double_page(words_result, image, height_to_width_ratio=1, wc_limit=2):
    """Locate the gutter between two pages printed side by side.

    Only images whose height/width ratio is below ``height_to_width_ratio``
    are examined. Characters are counted per strip in the central band
    (40%-60% of the width); the split is the middle of the longest run of
    strips holding at most ``wc_limit`` characters (earliest run on ties).

    The run search is delegated to ``get_longest_sequence`` instead of
    repeating the same bookkeeping inline.

    Args:
        words_result: list of OCR-style line dicts.
        image: array-like image; only its first two dimensions are used.
        height_to_width_ratio: images with a ratio at or above this are
            treated as single pages.
        wc_limit: maximum character count for a strip to count as empty.

    Returns:
        The x-coordinate of the split as ``int``, or 0 when the image is not
        examined or no sparse strip exists.
    """
    image = np.asarray(image)
    height, width = image.shape[0], image.shape[1]
    if not height / width < height_to_width_ratio:
        return 0

    word_count = word_projection(words_result, (height, width), left_ratio=0.4, right_ratio=0.6)
    start, stop = get_longest_sequence(word_count[0, :], wc_limit, type='l')
    if stop <= start:
        # No strip is sparse enough.
        return 0
    return int(word_count[1, (start + stop) // 2])
- # for i in range(word_count.shape[1]//2):
- # kplus = word_count.shape[1]//2 + i
- # kminus = word_count.shape[1]//2 - i
- # if word_count[0, kplus] <= wc_limit:
- # return word_count[1, kplus]
- # elif word_count[0, kminus] <= wc_limit:
- # return word_count[1, kminus]
- # return 0
def get_line_from_chars(chars):
    """Compute the bounding box that encloses all characters of a line.

    Args:
        chars: list of dicts whose ``'location'`` has ``'left'``, ``'top'``,
            ``'width'`` and ``'height'``.

    Returns:
        A dict with ``'width'``, ``'top'``, ``'left'`` and ``'height'`` of the
        enclosing box, or ``{}`` when ``chars`` is empty.
    """
    if not chars:
        return {}
    boxes = [char['location'] for char in chars]
    xmin = min(box['left'] for box in boxes)
    ymin = min(box['top'] for box in boxes)
    xmax = max(box['left'] + box['width'] for box in boxes)
    ymax = max(box['top'] + box['height'] for box in boxes)
    return {'width': xmax - xmin, 'top': ymin, 'left': xmin, 'height': ymax - ymin}
def get_box_from_lines(lines):
    """Compute the corner coordinates of the box enclosing all given lines.

    Args:
        lines: list of dicts whose ``'location'`` has ``'left'``, ``'top'``,
            ``'width'`` and ``'height'``.

    Returns:
        A dict with ``'xmin'``, ``'ymin'``, ``'xmax'`` and ``'ymax'``, or
        ``{}`` when ``lines`` is empty.
    """
    if not lines:
        return {}
    boxes = [line['location'] for line in lines]
    return {
        'xmin': min(box['left'] for box in boxes),
        'ymin': min(box['top'] for box in boxes),
        'xmax': max(box['left'] + box['width'] for box in boxes),
        'ymax': max(box['top'] + box['height'] for box in boxes),
    }
def split_line_for_double_pages(line, split_position):
    """Split one text line into its odd-page (left) and even-page (right) part.

    Characters whose horizontal centre is at or left of ``split_position`` go
    to the odd page, the others to the even page. When both sides receive
    characters, the line text is cut after as many non-whitespace characters
    as there are odd-page characters.

    Args:
        line: dict with ``'chars'``, ``'words'`` and ``'location'``.
        split_position: x-coordinate of the page boundary.

    Returns:
        ``(odd_page_line, even_page_line)``. A side without content is ``{}``.
        If all characters fall on one side the original ``line`` object is
        returned for that side. If the text holds fewer non-whitespace
        characters than the odd page needs, the line is not split and is
        assigned whole to the side containing its horizontal centre (the
        original crashed on the failed match in that case).
    """
    odd_page_chars = []
    even_page_chars = []
    for char in line['chars']:
        center = char['location']['left'] + char['location']['width'] // 2
        if center <= split_position:
            odd_page_chars.append(char)
        else:
            even_page_chars.append(char)

    if not odd_page_chars:
        return {}, line
    if not even_page_chars:
        return line, {}

    # One "\s*\S" per odd-page character, written as a counted repetition.
    prefix_pattern = re.compile(r'(?:\s*\S){%d}' % len(odd_page_chars))
    prefix = prefix_pattern.match(line['words'])
    if prefix is None:
        # Text and character list disagree: keep the line whole, on the side
        # of its centre (same rule a whole-line assignment would use).
        middle = line['location']['left'] + line['location']['width'] // 2
        if middle >= split_position:
            return {}, line
        return line, {}

    # .group(0) instead of match[0]: subscripting needs Python 3.6+.
    odd_page_words = prefix.group(0)
    even_page_words = line['words'][prefix.end():]

    odd_page_line = {
        'chars': odd_page_chars,
        'location': get_line_from_chars(odd_page_chars),
        'words': odd_page_words,
    }
    even_page_line = {}
    if even_page_words:
        even_page_line = {
            'chars': even_page_chars,
            'location': get_line_from_chars(even_page_chars),
            'words': even_page_words,
        }
    return odd_page_line, even_page_line
def get_double_page_text(words_result, split_position):
    """Distribute whole text lines over the odd (left) and even (right) page.

    A line goes to the even page when the horizontal centre of its box is at
    or right of ``split_position``, otherwise to the odd page. Lines are not
    split character by character.

    Args:
        words_result: list of line dicts with a ``'location'`` box.
        split_position: x-coordinate of the page boundary.

    Returns:
        ``[odd_page, even_page]`` - two lists of the original line objects.
    """
    odd_page, even_page = [], []
    for line in words_result:
        box = line['location']
        middle = box['left'] + box['width'] // 2
        target = even_page if middle >= split_position else odd_page
        target.append(line)
    return [odd_page, even_page]
- # def get_double_page_text(words_result, split_position):
- # odd_page = []
- # even_page = []
- # for line in words_result:
- # odd_page_chars = []
- # even_page_chars = []
- # for char in line['chars']:
- # center = char['location']['left'] + char['location']['width'] // 2
- # if center <= split_position:
- # odd_page_chars.append(char)
- # else:
- # even_page_chars.append(char)
- # line_result = get_line_from_chars(odd_page_chars)
- # if line_result:
- # odd_page.append(line_result)
- # line_result = get_line_from_chars(even_page_chars)
- # if line_result:
- # even_page.append(line_result)
- # return [odd_page, even_page]
def get_page_text(words_result, image):
    """Strip seal-line margins and split the text into pages.

    The left seal line is checked first and, if found, only the lines right
    of it are kept; then the right seal line is checked on the remaining
    lines and only the lines left of it are kept. Finally the page gutter is
    searched.

    Args:
        words_result: list of OCR-style line dicts.
        image: array-like image the text was recognised from.

    Returns:
        ``[odd_page, even_page]`` when a gutter is found, otherwise a
        one-element list holding the remaining lines.
    """
    # (side to inspect, index of the half to keep after splitting there)
    for side, kept_half in (('left', 1), ('right', 0)):
        boundary = check_seal_line(words_result, image, type=side)
        if boundary:
            words_result = get_double_page_text(words_result, boundary)[kept_half]

    split_position = check_double_page(words_result, image)
    if not split_position:
        return [words_result]
    return get_double_page_text(words_result, split_position)
def exam_segment(words_result):
    """Segment the text into one region per detected problem number.

    Each entry of the selected problem-number group spans from its own line
    up to the line before the next entry; the last entry runs to the final
    line. Every entry is extended in place with ``'end_line'`` and ``'box'``.

    Args:
        words_result: list of OCR-style line dicts with ``'location'`` boxes.

    Returns:
        The problem-number group, each dict carrying ``'end_line'`` and a
        ``'box'`` of ``xmin``/``ymin``/``xmax``/``ymax``. The box spans
        vertically from the top of the numbered line to the bottom of the
        last line, and horizontally over all lines in between.
    """
    problems = get_problem_list(get_number_list(get_number_position(words_result)))

    for current, following in zip(problems, problems[1:]):
        current.update(end_line=following['line'] - 1)
    if problems:
        problems[-1].update(end_line=len(words_result) - 1)

    for problem in problems:
        first_box = problem['location']
        last_box = words_result[problem['end_line']]['location']
        xmin = first_box['left']
        xmax = first_box['left'] + first_box['width']
        for index in range(problem['line'], problem['end_line'] + 1):
            box = words_result[index]['location']
            xmin = min(xmin, box['left'])
            xmax = max(xmax, box['left'] + box['width'])
        problem.update(box={
            'xmin': xmin,
            'ymin': first_box['top'],
            'xmax': xmax,
            'ymax': last_box['top'] + last_box['height'],
        })
    return problems
def show_result(img_file, debug=1):
    """Print (and for ``debug == 2`` display) diagnostics for one image.

    The recognition result is read from the file obtained by replacing
    ``'.jpg'`` with ``'_json.txt'`` in ``img_file``.

    Args:
        img_file: path of the image.
        debug: selects the output:
            0 - every text line, all detected numbers and the number groups;
            1 - the selected problem list and the segmented groups;
            2 - word/ink projections and the detected split lines, drawn on
                the image in a window (ESC terminates the program);
            3 - the text per page after seal-line removal and page split.
            Any other value prints only the file-name banner.

    Raises:
        FileNotFoundError: if the image cannot be read.
        SystemExit: when ESC is pressed in the ``debug == 2`` window.
    """
    image_color = cv2.imread(img_file)
    if image_color is None:
        # cv2.imread reports an unreadable file by returning None.
        raise FileNotFoundError('cannot read image: {}'.format(img_file))
    image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
    height, width = image.shape[0], image.shape[1]

    resp = get_respond_from_json(img_file.replace('.jpg', '_json.txt'))
    words_result = resp['words_result']
    print('**********{}*********'.format(os.path.split(img_file)[1]))

    numbers = get_number_position(words_result)
    number_list = get_number_list(numbers)
    problem_list = get_problem_list(number_list)
    group_list = exam_segment(words_result)

    if debug == 0:
        for line in words_result:
            print('**************************************')
            print(line['words'])
        print('************All Numbers************')
        for num in numbers:
            print(num)
        for group in number_list:
            print('*******Number List********')
            for n in group:
                print(n)
    elif debug == 1:
        print('**********Problem List*********')
        for p in problem_list:
            print(p)
        print('**********Group List**********')
        for g in group_list:
            print(g)
    elif debug == 2:
        gap = 20
        shape = (height, width)
        middle_word_count = word_projection(words_result, shape, left_ratio=0.4, right_ratio=0.6, gap=gap)
        left_word_count = word_projection(words_result, shape, left_ratio=0, right_ratio=0.15, gap=gap)
        right_word_count = word_projection(words_result, shape, left_ratio=0.85, right_ratio=1, gap=gap)
        left_image_projection = image_projection(image, left_ratio=0, right_ratio=0.15, gap=gap)
        middle_image_projection = image_projection(image, left_ratio=0.4, right_ratio=0.6, gap=gap)
        right_image_projection = image_projection(image, left_ratio=0.85, right_ratio=1, gap=gap)
        print('**********Left Projection************')
        print(left_word_count)
        print(left_image_projection)
        print('**********Middle Projection************')
        print(middle_word_count)
        print(middle_image_projection)
        print('**********Right Projection************')
        print(right_word_count)
        print(right_image_projection)
        print('************Split Line****************')
        left_p = check_seal_line(words_result, image, type='left')
        right_p = check_seal_line(words_result, image, type='right')
        middle_p = check_double_page(words_result, image)
        print(left_p, middle_p, right_p)
        # The detectors may hand back numpy integers; give OpenCV plain ints.
        for position, color in ((left_p, (0, 0, 255)), (middle_p, (0, 255, 0)), (right_p, (255, 0, 0))):
            x = int(position)
            cv2.line(image_color, (x, 0), (x, height), color, 5)
        cv2.namedWindow('image', cv2.WINDOW_NORMAL)
        cv2.imshow('image', image_color)
        if cv2.waitKey(0) == 27:  # press ESC to exit
            raise SystemExit(0)
        cv2.destroyAllWindows()
    elif debug == 3:
        page_text = get_page_text(words_result, image)
        if len(page_text) == 1:
            print('*************Single Page*********')
            for line in page_text[0]:
                print(line['words'])
        else:
            print('*************Odd Page**********')
            for line in page_text[0]:
                print(line['words'])
            print('************Even Page**********')
            for line in page_text[1]:
                print(line['words'])
- # elif style == 4:
- # print('***********Page Text***********')
- # page_result = get_page_text(words_result, image)
- # if len(page_result) == 1:
- # print('***********Single Page***********')
- # for line in page_result[0]:
- # print(line['words'])
- # elif len(page_result) == 2:
- # print('*********Odd************')
- # for line in page_result[0]:
- # print(line['words'])
- # print('********Even************')
- # for line in page_result[1]:
- # print(line['words'])
- # if __name__ == "__main__":
- # img_file = r'E:\data\test-problems\10.jpg'
- # # show_result(img_file, debug=2)
- # image_color = cv2.imread(img_file)
- # image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
- # height = image.shape[0]
- # width = image.shape[1]
- # resp = get_respond_from_json(img_file.replace('.jpg', '_json.txt'))
- # words_result = resp['words_result']
- # print('**********{}*********'.format(os.path.split(img_file)[1]))
- # text_list = get_page_text(words_result, image)
- #
- # # work_dir = r'E:\data\seal_line'
- # # for img_file in glob.glob(os.path.join(work_dir, '*.jpg')):
- # # show_result(img_file, style=2)
|