# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from copy import deepcopy

try:
    import numba as nb
except ImportError:
    # numba only accelerates the pixel-scan kernels below; without it they
    # still run (slowly) as plain Python functions.
    class _NumbaFallback:
        """Minimal stand-in so ``@nb.njit`` keeps working without numba."""

        @staticmethod
        def njit(*args, **kwargs):
            if len(args) == 1 and callable(args[0]) and not kwargs:
                return args[0]
            return lambda func: func

    nb = _NumbaFallback()


# Box rows are laid out as [x_min, y_min, x_max, y_max, tag].  A box that has
# been merged into another one is overwritten with the sentinel row
# [0, 0, 0, 0, 10] and filtered out later.


@nb.njit
def _merge_into(dst, i, src, j):
    """Grow ``dst[i]`` in place to the bounding box of ``dst[i]`` and ``src[j]``."""
    dst[i][0] = min(dst[i][0], src[j][0])
    dst[i][1] = min(dst[i][1], src[j][1])
    dst[i][2] = max(dst[i][2], src[j][2])
    dst[i][3] = max(dst[i][3], src[j][3])


@nb.njit
def _find_right(boxes, i, j, find_range):
    """Merge ``boxes[j]`` into ``boxes[i]`` if it lies just right of it.

    Scans the strip ``find_range`` wide starting at the right edge of
    ``boxes[i]`` (rows shrunk by one at top and bottom) for a point strictly
    inside ``boxes[j]``.  On a hit ``boxes[i]`` becomes the union of both
    boxes and ``boxes[j]`` is replaced by the sentinel row.

    :return: ``xs - boxes[i][2]`` evaluated *after* the merge (so it is <= 0
        relative to the new right edge), or ``None`` when nothing matched.
        NOTE(review): the post-merge evaluation looks unintended but is kept
        so existing callers see the same values.
    """
    for xs in range(int(boxes[i][2]), int(boxes[i][2] + find_range)):
        for ys in range(boxes[i][1] + 1, boxes[i][3] - 1):
            if boxes[j][0] < xs < boxes[j][2] and boxes[j][1] < ys < boxes[j][3]:
                _merge_into(boxes, i, boxes, j)
                boxes[j] = np.array([0, 0, 0, 0, 10])
                return xs - boxes[i][2]


@nb.njit
def _find_down(boxes, y_max, i, j):
    """Merge ``boxes[j]`` into ``boxes[i]`` if it lies below it.

    Scans the columns of ``boxes[i]`` from its bottom edge down to ``y_max``
    for a point strictly inside ``boxes[j]``.  Unlike the other kernels the
    absorbed row ``boxes[j]`` is left untouched.

    :return: ``1`` when a merge happened, otherwise ``None``.
    """
    for xs in range(boxes[i][0], boxes[i][2]):
        for ys in range(boxes[i][3], y_max):
            # The vertical test must use y_max (index 3); the previous code
            # compared ys against x_max (index 2).
            if boxes[j][0] < xs < boxes[j][2] and boxes[j][1] < ys < boxes[j][3]:
                _merge_into(boxes, i, boxes, j)
                return 1


@nb.njit
def find_in(in_box, out_box, i, j, find_range=0):
    """Absorb ``in_box[i]`` into ``out_box[j]`` when the two touch or overlap.

    Every point of ``in_box[i]`` expanded by ``find_range`` on each side (a
    negative value shrinks it) is tested against the closed rectangle
    ``out_box[j]``.  On a hit ``out_box[j]`` becomes the union of both boxes
    and ``in_box[i]`` is replaced by the sentinel row.

    :return: ``1`` when a merge happened, otherwise ``None``.
    """
    for xs in range(in_box[i][0] - find_range, in_box[i][2] + find_range):
        for ys in range(in_box[i][1] - find_range, in_box[i][3] + find_range):
            if out_box[j][0] <= xs <= out_box[j][2] and out_box[j][1] <= ys <= out_box[j][3]:
                _merge_into(out_box, j, in_box, i)
                in_box[i] = np.array([0, 0, 0, 0, 10])
                return 1


@nb.njit
def find_right(the_boxes, unknow_box, i, j, find_range):
    """Absorb ``unknow_box[j]`` into ``the_boxes[i]`` if it lies just right of it.

    Scans the strip ``int(find_range)`` wide starting at the right edge of
    ``the_boxes[i]`` for a point strictly inside ``unknow_box[j]``.

    :return: ``xs - the_boxes[i][2]`` evaluated after the merge, or ``None``
        when nothing matched.  NOTE(review): post-merge value kept as-is.
    """
    for xs in range(the_boxes[i][2], the_boxes[i][2] + int(find_range)):
        for ys in range(the_boxes[i][1], the_boxes[i][3]):
            if unknow_box[j][0] < xs < unknow_box[j][2] and unknow_box[j][1] < ys < unknow_box[j][3]:
                _merge_into(the_boxes, i, unknow_box, j)
                unknow_box[j] = np.array([0, 0, 0, 0, 10])
                return xs - the_boxes[i][2]


@nb.njit
def find_left(the_boxes, unknow_box, i, j, find_range):
    """Absorb ``unknow_box[j]`` into ``the_boxes[i]`` if it lies just left of it.

    Scans leftwards from the left edge of ``the_boxes[i]`` for at most
    ``find_range`` columns (never past x = 0) looking for a point strictly
    inside ``unknow_box[j]``.

    :return: ``1`` when a merge happened, otherwise ``None``.
    """
    for xs in range(the_boxes[i][0], max(int(the_boxes[i][0] - find_range), 0), -1):
        for ys in range(the_boxes[i][1], the_boxes[i][3]):
            if unknow_box[j][0] < xs < unknow_box[j][2] and unknow_box[j][1] < ys < unknow_box[j][3]:
                _merge_into(the_boxes, i, unknow_box, j)
                unknow_box[j] = np.array([0, 0, 0, 0, 10])
                return 1


@nb.njit
def find_down(the_boxes, unknow_box, i, j, find_range):
    """Absorb ``unknow_box[j]`` into ``the_boxes[i]`` lying just below it.

    Scans the strip ``find_range`` high directly under ``unknow_box[j]`` for
    a point strictly inside ``the_boxes[i]``.

    :return: ``ys - unknow_box[j][3]`` evaluated after ``unknow_box[j]`` was
        reset to the sentinel (i.e. effectively ``ys``), or ``None`` when
        nothing matched.  NOTE(review): post-reset value kept as-is.
    """
    for xs in range(int(unknow_box[j][0]), int(unknow_box[j][2])):
        for ys in range(int(unknow_box[j][3]), int(unknow_box[j][3] + find_range)):
            if the_boxes[i][0] < xs < the_boxes[i][2] and the_boxes[i][1] < ys < the_boxes[i][3]:
                _merge_into(the_boxes, i, unknow_box, j)
                unknow_box[j] = np.array([0, 0, 0, 0, 10])
                return ys - unknow_box[j][3]


@nb.njit
def find_top(the_boxes, unknow_box, i, j, find_range):
    """Absorb ``unknow_box[j]`` into ``the_boxes[i]`` lying just above it.

    Scans upwards from the top edge of ``unknow_box[j]`` for at most
    ``find_range`` rows looking for a point strictly inside ``the_boxes[i]``.

    :return: ``unknow_box[j][3] - ys`` evaluated after ``unknow_box[j]`` was
        reset to the sentinel (i.e. effectively ``-ys``), or ``None`` when
        nothing matched.  NOTE(review): post-reset value kept as-is.
    """
    for xs in range(int(unknow_box[j][0]), int(unknow_box[j][2])):
        for ys in range(int(unknow_box[j][1]), int(unknow_box[j][1] - find_range), -1):
            if the_boxes[i][0] < xs < the_boxes[i][2] and the_boxes[i][1] < ys < the_boxes[i][3]:
                _merge_into(the_boxes, i, unknow_box, j)
                unknow_box[j] = np.array([0, 0, 0, 0, 10])
                return unknow_box[j][3] - ys


def _area(box):
    """Return width * height of a ``[x_min, y_min, x_max, y_max, ...]`` box."""
    return (box[2] - box[0]) * (box[3] - box[1])


def _as_sorted_boxes(boxes, tag):
    """Return *boxes* as a 2-D int array sorted by x_min, column 4 set to *tag*."""
    if len(boxes) == 0:
        return np.zeros((0, 5), dtype=int)
    frame = pd.DataFrame(boxes)
    frame[4] = tag
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is equivalent.
    return frame.sort_values(by=0).astype(int).values


def _drop_consumed(boxes):
    """Drop sentinel rows, i.e. rows whose four coordinates are all zero."""
    # Testing only x_min (as the old ``if i[0]`` filter did) would also
    # discard genuine boxes that touch the left edge (x_min == 0).
    return boxes[boxes[:, :4].any(axis=1)]


def _sweep(kernel, targets, candidates, reach):
    """Run a directional merge kernel for every (target, candidate) pair."""
    for i in range(len(targets)):
        for j in range(len(candidates)):
            kernel(targets, candidates, i, j, reach)


def _absorb_contained(inner, outer):
    """Merge each ``inner`` row into every ``outer`` row it touches."""
    for i in range(len(inner)):
        for j in range(len(outer)):
            if not np.array_equal(inner[i], outer[j]):
                find_in(inner, outer, i, j, 0)


def _live_coords(boxes):
    """Return the coordinate part of every non-sentinel row as a list."""
    return [row[:4] for row in boxes if row[:4].any()]


def neighbor_change(tex_box, im_box):
    """
    Post-processing step that merges neighbouring boxes.

    Image boxes smaller than ``1.2 * median_text_height ** 2`` are treated as
    "unknown" fragments and glued onto adjacent image or text boxes; text
    boxes are merged with close neighbours; boxes touching a larger box are
    absorbed by it.  Unknown fragments that never get merged are not
    returned.

    :param tex_box: text regions, rows of ``[x_min, y_min, x_max, y_max]``
    :param im_box: image regions, same row layout
    :return: ``(text_boxes, image_boxes)``, each a list of length-4 integer
        arrays
    """
    if len(tex_box) == 0:
        # Without text there is no median line height to scale the merge
        # distances by, so the image boxes are passed through unchanged.
        return [], [np.asarray(box)[:4].astype(int) for box in im_box]

    tex_frame = pd.DataFrame(tex_box)
    mix = (tex_frame[3] - tex_frame[1]).median()  # median text height
    min_xs = 1.2 * mix ** 2  # area below which a box counts as "small"

    # Split image boxes into real images and small unknown fragments.  (The
    # equivalent split for small text boxes was disabled in the original
    # code, so every text box starts out as regular text.)
    big_images, fragments = [], []
    for box in im_box:
        (fragments if _area(box) < min_xs else big_images).append(box)

    image_box = _as_sorted_boxes(big_images, 1)
    unknow_box = _as_sorted_boxes(fragments, 2)
    tex_box = _as_sorted_boxes(list(tex_box), 1)

    # Glue small fragments onto images: above, below, left, then right.
    for kernel in (find_top, find_down, find_left, find_right):
        _sweep(kernel, image_box, unknow_box, mix // 2)
        unknow_box = _drop_consumed(unknow_box)

    # Text boxes absorb text boxes immediately to their right.
    for i in range(len(tex_box)):
        for j in range(i + 1, len(tex_box)):
            _find_right(tex_box, i, j, mix)
    tex_box = _drop_consumed(tex_box)

    # Re-split the merged text boxes into regular and small ones.
    regular_rows, small_rows = [], []
    for row in tex_box:
        (small_rows if 1 < _area(row) < min_xs else regular_rows).append(row)
    tex_box = _as_sorted_boxes(regular_rows, 1)
    small_text = _as_sorted_boxes(small_rows, 3)

    # Large text boxes absorb fragments / small text on their left.
    for i in range(len(tex_box)):
        if _area(tex_box[i]) > 5 * min_xs:
            for j in range(len(unknow_box)):
                find_left(tex_box, unknow_box, i, j, mix)
    for i in range(len(tex_box)):
        if _area(tex_box[i]) > 5 * min_xs:
            for j in range(len(small_text)):
                find_left(tex_box, small_text, i, j, 5 * mix)

    # Text boxes search downwards.
    _sweep(find_down, tex_box, unknow_box, mix // 3)
    unknow_box = _drop_consumed(unknow_box)
    _sweep(find_down, tex_box, small_text, mix // 3)
    small_text = _drop_consumed(small_text)

    # Text boxes search upwards.
    _sweep(find_top, tex_box, unknow_box, mix // 3)
    _sweep(find_top, tex_box, small_text, mix // 3)

    image_box = _drop_consumed(image_box)
    tex_box = _drop_consumed(tex_box)

    # Eliminate image boxes nested in / touching a later image box.
    for i in range(len(image_box)):
        for j in range(i + 1, len(image_box)):
            if not np.array_equal(image_box[i], image_box[j]):
                find_in(image_box, image_box, i, j, 0)

    # Eliminate nested text boxes (shrunk slightly so mere contact is ignored).
    shrink = -int(mix // 5)
    for i in range(len(tex_box)):
        for j in range(i + 1, len(tex_box)):
            find_in(tex_box, tex_box, i, j, shrink)

    _absorb_contained(small_text, tex_box)
    _absorb_contained(tex_box, image_box)
    _absorb_contained(small_text, image_box)

    text_box_p = _live_coords(tex_box) + _live_coords(small_text)
    image_box_p = _live_coords(image_box)
    return text_box_p, image_box_p


class Neighbor:
    """Classify raw boxes into text, image and small "unknown" boxes."""

    def __init__(self, tex_box, im_box):
        """
        :param tex_box: text regions, rows of ``[x_min, y_min, x_max, y_max]``
        :param im_box: image regions, same row layout

        Non-empty groups are stored as int arrays sorted by x_min with a tag
        in column 4 (1 for text and image, 2 for unknown); empty groups stay
        empty lists.
        """
        self.text_box, self.image_box, self.unknow_box = [], [], []

        tex_box_df = pd.DataFrame(tex_box)
        height = tex_box_df[3] - tex_box_df[1]
        mix = height.median()
        min_xs = mix ** 2

        for box in tex_box:
            if _area(box) < min_xs / 2000:
                self.unknow_box.append(np.array(box))
            else:
                self.text_box.append(box)

        for box in im_box:
            if _area(box) < min_xs:
                self.unknow_box.append(box)
            else:
                self.image_box.append(box)

        if self.image_box:
            self.image_box = _as_sorted_boxes(self.image_box, 1)
        if self.unknow_box:
            self.unknow_box = _as_sorted_boxes(self.unknow_box, 2)
        if self.text_box:
            # NOTE(review): the sorted array is stored under ``tex_box`` while
            # ``text_box`` keeps the raw list; kept as-is because code outside
            # this view may rely on either attribute.
            self.tex_box = _as_sorted_boxes(self.text_box, 1)