123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287 |
- # -*- coding:utf-8 -*-
- import numpy as np
- import pandas as pd
- from copy import deepcopy
- from numba import njit
def _span_hits(start, stop, low, high, closed=False):
    """Tell whether some integer of ``range(start, stop)`` lies between two bounds.

    This is the closed-form equivalent of looping over ``range(start, stop)``
    and testing ``low < v < high`` (or ``low <= v <= high`` when ``closed`` is
    true) for each value, so probing a window no longer costs one iteration
    per pixel.

    :param start: first integer of the window (inclusive)
    :param stop: end of the window (exclusive); an empty window never hits
    :param low: lower bound of the target interval
    :param high: upper bound of the target interval
    :param closed: when true the bounds themselves count as inside
    :return: truthy if the window and the interval share an integer
    """
    first, last = start, stop - 1
    if closed:
        return max(first, low) <= min(last, high)
    return max(first, low + 1) <= min(last, high - 1)


def _absorb_box(keep, k, gone, g):
    """Grow ``keep[k]`` to the bounding box of both rows, then blank ``gone[g]``.

    The absorbed row is overwritten with ``[0, 0, 0, 0, 10]``; the caller's
    final output filter drops rows whose four coordinates sum to zero.
    """
    keep[k][0] = min(keep[k][0], gone[g][0])
    keep[k][1] = min(keep[k][1], gone[g][1])
    keep[k][2] = max(keep[k][2], gone[g][2])
    keep[k][3] = max(keep[k][3], gone[g][3])
    gone[g] = np.array([0, 0, 0, 0, 10])


def _to_sorted_boxes(boxes, label):
    """Return ``boxes`` as a writable int array sorted by column 0.

    Column 4 is set to ``label``.  An empty input yields an empty list so
    that callers can keep using ``len()`` and index loops without a special
    case (sorting an empty frame by column 0 would raise ``KeyError``).
    """
    if not len(boxes):
        return []
    frame = pd.DataFrame(boxes)
    frame[4] = label
    # ``int`` replaces the ``np.int`` alias that newer NumPy no longer has.
    # ``np.array`` copies, because ``.values`` may be a read-only view and
    # the rows are edited in place afterwards.
    return np.array(frame.sort_values(by=0).astype(int).values)


def _box_area(box):
    """Area of a box from its first four entries (columns 0-3)."""
    return (box[2] - box[0]) * (box[3] - box[1])


def _chain_rightward(boxes, i, j, reach):
    """Merge ``boxes[j]`` into ``boxes[i]`` if it lies just right of it.

    The probe window starts at column 2 of ``boxes[i]``, is ``reach`` wide,
    and is one pixel shorter than the box at both its top and bottom.
    """
    a, b = boxes[i], boxes[j]
    if (_span_hits(int(a[2]), int(a[2] + reach), b[0], b[2])
            and _span_hits(a[1] + 1, a[3] - 1, b[1], b[3])):
        _absorb_box(boxes, i, boxes, j)


def _attach_rightward(boxes, loose, i, j, reach):
    """Absorb ``loose[j]`` if it enters the strip right of ``boxes[i]``."""
    a, b = boxes[i], loose[j]
    if (_span_hits(a[2], a[2] + int(reach), b[0], b[2])
            and _span_hits(a[1], a[3], b[1], b[3])):
        _absorb_box(boxes, i, loose, j)


def _attach_leftward(boxes, loose, i, j, reach):
    """Absorb ``loose[j]`` if it enters the strip left of ``boxes[i]``."""
    a, b = boxes[i], loose[j]
    far = max(int(a[0] - reach), 0)
    # range(a[0], far, -1) visits far + 1 .. a[0]
    if (_span_hits(far + 1, a[0] + 1, b[0], b[2])
            and _span_hits(a[1], a[3], b[1], b[3])):
        _absorb_box(boxes, i, loose, j)


def _attach_downward(boxes, loose, i, j, reach):
    """Absorb ``loose[j]`` if the strip just below it enters ``boxes[i]``."""
    a, b = boxes[i], loose[j]
    if (_span_hits(int(b[0]), int(b[2]), a[0], a[2])
            and _span_hits(int(b[3]), int(b[3] + reach), a[1], a[3])):
        _absorb_box(boxes, i, loose, j)


def _attach_upward(boxes, loose, i, j, reach):
    """Absorb ``loose[j]`` if the strip just above it enters ``boxes[i]``."""
    a, b = boxes[i], loose[j]
    far = int(b[1] - reach)
    # range(b[1], far, -1) visits far + 1 .. b[1]
    if (_span_hits(int(b[0]), int(b[2]), a[0], a[2])
            and _span_hits(far + 1, int(b[1]) + 1, a[1], a[3])):
        _absorb_box(boxes, i, loose, j)


def _swallow(inner, outer, i, j, pad=0):
    """Merge ``inner[i]`` into ``outer[j]`` when the two boxes touch.

    ``inner[i]`` is first grown by ``pad`` on every side (shrunk when ``pad``
    is negative); the bounds of ``outer[j]`` count as inside.
    """
    a, b = inner[i], outer[j]
    if (_span_hits(a[0] - pad, a[2] + pad, b[0], b[2], closed=True)
            and _span_hits(a[1] - pad, a[3] + pad, b[1], b[3], closed=True)):
        _absorb_box(outer, j, inner, i)


def neighbor_change(tex_box, im_box):
    """
    Post-processing: merge neighbouring boxes.

    Boxes are indexed as ``[x0, y0, x1, y1]`` (columns 0/2 are scanned as x,
    columns 1/3 as y).  The median text-box height ``mix`` sets every
    distance threshold and ``min_xs = 1.2 * mix ** 2`` is the area below
    which a box is treated as "small".

    Steps, in order:

    1. split ``im_box`` into large images and small "unknown" boxes;
    2. let large images absorb small boxes lying within ``mix // 2`` above,
       below, left and right of them;
    3. chain text boxes that follow each other within ``mix`` to the right;
    4. move text boxes that are still small into a separate small-text set;
    5. let text boxes absorb small images / small text on their left
       (only text boxes larger than ``5 * min_xs``), below and above;
    6. merge overlapping image boxes, overlapping text boxes, and finally
       text that overlaps an image into that image.

    Small image boxes that were never absorbed are not returned.

    :param tex_box: text regions; a non-empty sequence of boxes is needed
        to derive the scale, an empty one returns early
    :param im_box: unknown (image) regions
    :return: ``(text_boxes, image_boxes)``, two lists of length-4 integer
        arrays; absorbed boxes are left out
    """
    if not len(tex_box):
        # No text means no median height to derive thresholds from, so the
        # image boxes are handed back unmerged instead of raising KeyError.
        return [], [np.asarray(box)[:4].astype(int) for box in im_box]

    frame = pd.DataFrame(tex_box)
    mix = (frame[3] - frame[1]).median()
    min_xs = 1.2 * mix ** 2

    # Separate large images from small ones.
    large, loose = [], []
    for box in im_box:
        (loose if _box_area(box) < min_xs else large).append(box)

    image_box = _to_sorted_boxes(large, 1)
    unknow_box = _to_sorted_boxes(loose, 2)
    tex_box = _to_sorted_boxes(list(tex_box), 1)

    # Small images stick to large images: upward, downward, left, right.
    half = mix // 2
    for attach in (_attach_upward, _attach_downward,
                   _attach_leftward, _attach_rightward):
        for i in range(len(image_box)):
            for j in range(len(unknow_box)):
                attach(image_box, unknow_box, i, j, half)

    # Text boxes look to their right for the next text box.
    for i in range(len(tex_box)):
        for j in range(i + 1, len(tex_box)):
            _chain_rightward(tex_box, i, j, mix)

    # Text boxes that are still small after chaining are handled separately.
    # Blanked rows (area 0) stay in the main set and are filtered at the end.
    kept, small = [], []
    for box in tex_box:
        if 1 < _box_area(box) < min_xs:
            small.append(np.array(box))
        else:
            kept.append(box)
    tex_box = _to_sorted_boxes(kept, 1)
    small_text = _to_sorted_boxes(small, 3)

    # Large text boxes absorb small images, then small text, on their left.
    for loose_set, reach in ((unknow_box, mix), (small_text, 5 * mix)):
        for i in range(len(tex_box)):
            # The size test is made once per text box, before it can grow.
            if _box_area(tex_box[i]) > 5 * min_xs:
                for j in range(len(loose_set)):
                    _attach_leftward(tex_box, loose_set, i, j, reach)

    # Text boxes absorb small boxes sitting right above, then right below.
    third = mix // 3
    for attach in (_attach_downward, _attach_upward):
        for loose_set in (unknow_box, small_text):
            for i in range(len(tex_box)):
                for j in range(len(loose_set)):
                    attach(tex_box, loose_set, i, j, third)

    # Remove image boxes that overlap a later image box.
    for i in range(len(image_box)):
        for j in range(i + 1, len(image_box)):
            if not all(image_box[i] == image_box[j]):
                _swallow(image_box, image_box, i, j, 0)

    # Remove text boxes that overlap a later one by more than mix // 5.
    shrink = -int(mix // 5)
    for i in range(len(tex_box)):
        for j in range(i + 1, len(tex_box)):
            if not all(tex_box[i] == tex_box[j]):
                _swallow(tex_box, tex_box, i, j, shrink)

    # Small text into text, then text and small text into images.
    for inner, outer in ((small_text, tex_box),
                         (tex_box, image_box),
                         (small_text, image_box)):
        for i in range(len(inner)):
            for j in range(len(outer)):
                if not all(inner[i] == outer[j]):
                    _swallow(inner, outer, i, j, 0)

    # Rows whose coordinates sum to zero were absorbed elsewhere.
    text_box_p = [box[:4] for box in tex_box if np.sum(box[:4])]
    text_box_p += [box[:4] for box in small_text if np.sum(box[:4])]
    image_box_p = [box[:4] for box in image_box if np.sum(box[:4])]
    return text_box_p, image_box_p
|