Near.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. # -*- coding:utf-8 -*-
  2. import numpy as np
  3. import pandas as pd
  4. from copy import deepcopy
  5. import numba as nb
  6. @nb.njit
  7. def _find_right(boxes, i, j, find_range):
  8. for xs in range(int(boxes[i][2]), int(boxes[i][2] + find_range)):
  9. for ys in range(boxes[i][1] + 1, boxes[i][3] - 1):
  10. if boxes[j][0] < xs < boxes[j][2] and boxes[j][1] < ys < boxes[j][3]:
  11. boxes[i][0] = min(boxes[i][0], boxes[j][0])
  12. boxes[i][1] = min(boxes[i][1], boxes[j][1])
  13. boxes[i][2] = max(boxes[i][2], boxes[j][2])
  14. boxes[i][3] = max(boxes[i][3], boxes[j][3])
  15. boxes[j] = np.array([0, 0, 0, 0, 10])
  16. return xs - boxes[i][2]
  17. @nb.njit
  18. def _find_down(boxes, y_max, i, j):
  19. for xs in range(boxes[i][0], boxes[i][2]):
  20. for ys in range(boxes[i][3], y_max):
  21. if boxes[j][0] < xs < boxes[j][2] and boxes[j][1] < ys < boxes[j][2]:
  22. boxes[i][0] = min(boxes[i][0], boxes[j][0])
  23. boxes[i][1] = min(boxes[i][1], boxes[j][1])
  24. boxes[i][2] = max(boxes[i][2], boxes[j][2])
  25. boxes[i][3] = max(boxes[i][3], boxes[j][3])
  26. return 1
  27. @nb.njit
  28. def find_in(in_box, out_box, i, j, find_range=0):
  29. # if all(in_box[i] == out_box[j] ):
  30. #
  31. # return
  32. # x_min, y_min, x_max, y_max = 0, 1, 2, 3
  33. # if in_box[i][x_max] <= out_box[j][x_max] and in_box[i][x_min] >= out_box[j][x_min]:
  34. # overlap_w = in_box[i]
  35. # elif in_box[i][x_max] >= out_box[j][x_max] and in_box[i][x_min] >= out_box[j][x_min]:
  36. # overlap_w = in_box[i]
  37. #
  38. #
  39. # if in_box[i][x_max] <= out_box[j][x_max] and \
  40. # in_box[i][y_max] <= out_box[j][y_max] and \
  41. # in_box[i][x_min] > out_box[j][x_min] and \
  42. # in_box[i][y_min] < out_box[j][y_min]+11:
  43. # if find_range==888:
  44. # print('********************')
  45. # print(in_box[i])
  46. # out_box[j][0] = min(out_box[j][0], in_box[i][0])
  47. # out_box[j][1] = min(out_box[j][1], in_box[i][1])
  48. # out_box[j][2] = max(out_box[j][2], in_box[i][2])
  49. # out_box[j][3] = max(out_box[j][3], in_box[i][3])
  50. # in_box[i] = np.array([0, 0, 0, 0, 10])
  51. # return 1
  52. for xs in range(in_box[i][0] - find_range, in_box[i][2] + find_range):
  53. for ys in range(in_box[i][1] - find_range, in_box[i][3] + find_range):
  54. if out_box[j][0] <= xs <= out_box[j][2] and out_box[j][1] <= ys <= out_box[j][3]:
  55. out_box[j][0] = min(out_box[j][0], in_box[i][0])
  56. out_box[j][1] = min(out_box[j][1], in_box[i][1])
  57. out_box[j][2] = max(out_box[j][2], in_box[i][2])
  58. out_box[j][3] = max(out_box[j][3], in_box[i][3])
  59. in_box[i] = np.array([0, 0, 0, 0, 10])
  60. return 1
  61. @nb.njit
  62. def find_right(the_boxes, unknow_box, i, j, find_range):
  63. for xs in range(the_boxes[i][2], the_boxes[i][2] + int(find_range)):
  64. for ys in range(the_boxes[i][1], the_boxes[i][3]):
  65. if unknow_box[j][0] < xs < unknow_box[j][2] and unknow_box[j][1] < ys < unknow_box[j][3]:
  66. the_boxes[i][0] = min(the_boxes[i][0], unknow_box[j][0])
  67. the_boxes[i][1] = min(the_boxes[i][1], unknow_box[j][1])
  68. the_boxes[i][2] = max(the_boxes[i][2], unknow_box[j][2])
  69. the_boxes[i][3] = max(the_boxes[i][3], unknow_box[j][3])
  70. unknow_box[j] = np.array([0, 0, 0, 0, 10])
  71. return xs - the_boxes[i][2]
  72. @nb.njit
  73. def find_left(the_boxes, unknow_box, i, j, find_range):
  74. for xs in range(the_boxes[i][0], max(int(the_boxes[i][0] - find_range), 0), -1):
  75. for ys in range(the_boxes[i][1], the_boxes[i][3]):
  76. if unknow_box[j][0] < xs < unknow_box[j][2] and unknow_box[j][1] < ys < unknow_box[j][3]:
  77. the_boxes[i][0] = min(the_boxes[i][0], unknow_box[j][0])
  78. the_boxes[i][1] = min(the_boxes[i][1], unknow_box[j][1])
  79. the_boxes[i][2] = max(the_boxes[i][2], unknow_box[j][2])
  80. the_boxes[i][3] = max(the_boxes[i][3], unknow_box[j][3])
  81. unknow_box[j] = np.array([0, 0, 0, 0, 10])
  82. return 1
  83. @nb.njit
  84. def find_down(the_boxes, unknow_box, i, j, find_range):
  85. for xs in range(int(unknow_box[j][0]), int(unknow_box[j][2])):
  86. for ys in range(int(unknow_box[j][3]), int(unknow_box[j][3] + find_range)):
  87. if the_boxes[i][0] < xs < the_boxes[i][2] and the_boxes[i][1] < ys < the_boxes[i][3]:
  88. the_boxes[i][0] = min(the_boxes[i][0], unknow_box[j][0])
  89. the_boxes[i][1] = min(the_boxes[i][1], unknow_box[j][1])
  90. the_boxes[i][2] = max(the_boxes[i][2], unknow_box[j][2])
  91. the_boxes[i][3] = max(the_boxes[i][3], unknow_box[j][3])
  92. unknow_box[j] = np.array([0, 0, 0, 0, 10])
  93. return ys - unknow_box[j][3]
  94. @nb.njit
  95. def find_top(the_boxes, unknow_box, i, j, find_range):
  96. for xs in range(int(unknow_box[j][0]), int(unknow_box[j][2])):
  97. for ys in range(int(unknow_box[j][1]), int(unknow_box[j][1] - find_range), -1):
  98. if the_boxes[i][0] < xs < the_boxes[i][2] and the_boxes[i][1] < ys < the_boxes[i][3]:
  99. the_boxes[i][0] = min(the_boxes[i][0], unknow_box[j][0])
  100. the_boxes[i][1] = min(the_boxes[i][1], unknow_box[j][1])
  101. the_boxes[i][2] = max(the_boxes[i][2], unknow_box[j][2])
  102. the_boxes[i][3] = max(the_boxes[i][3], unknow_box[j][3])
  103. unknow_box[j] = np.array([0, 0, 0, 0, 10])
  104. return unknow_box[j][3] - ys
  105. def neighbor_change(tex_box, im_box):
  106. """
  107. 后期处理 临近合并
  108. :param tex_box: 文字区域
  109. :param im_box: 未知区域
  110. :return:
  111. """
  112. # return tex_box, im_box
  113. text_box, image_box, unknow_box, small_text = [], [], [], []
  114. tex_box_df = pd.DataFrame(tex_box)
  115. height = tex_box_df[3] - tex_box_df[1]
  116. mix = height.median()
  117. min_xs = 1.2 * mix ** 2
  118. img_w = int(tex_box_df[2].max())
  119. for i in tex_box:
  120. if (i[2] - i[0]) * (i[3] - i[1]) < min_xs and False:
  121. small_text.append(np.array(i))
  122. unknow_box.append(np.array(i))
  123. else:
  124. text_box.append(i)
  125. # 分开大图和小图
  126. for i in im_box:
  127. if (i[2] - i[0]) * (i[3] - i[1]) < min_xs:
  128. unknow_box.append(i)
  129. else:
  130. image_box.append(i)
  131. if len(image_box):
  132. image_box = pd.DataFrame(image_box)
  133. image_box[4] = 1
  134. image_box = image_box.sort_values(by=0).astype(np.int).values
  135. # print(unknow_box)
  136. if len(unknow_box) > 0:
  137. unknow_box = pd.DataFrame(unknow_box)
  138. unknow_box[4] = 2
  139. unknow_box = unknow_box.sort_values(by=0).astype(np.int).values
  140. tex_box = pd.DataFrame(text_box)
  141. tex_box[4] = 1
  142. tex_box = tex_box.sort_values(by=0).astype(np.int).values
  143. # 小图片向上粘贴
  144. for i in range(len(image_box)):
  145. for j in range(len(unknow_box)):
  146. find_top(image_box, unknow_box, i, j, mix // 2)
  147. unknow_box = np.array([i for i in unknow_box if i[0]])
  148. # 小图片向下粘贴
  149. for i in range(len(image_box)):
  150. for j in range(len(unknow_box)):
  151. find_down(image_box, unknow_box, i, j, mix // 2)
  152. unknow_box = np.array([i for i in unknow_box if i[0]])
  153. # 小图片向左粘贴
  154. for i in range(len(image_box)):
  155. for j in range(len(unknow_box)):
  156. find_left(image_box, unknow_box, i, j, mix // 2)
  157. unknow_box = np.array([i for i in unknow_box if i[0]])
  158. # 小图片向右粘贴
  159. for i in range(len(image_box)):
  160. for j in range(len(unknow_box)):
  161. find_right(image_box, unknow_box, i, j, mix // 2)
  162. unknow_box = np.array([i for i in unknow_box if i[0]])
  163. # 消除在imagebox里的textbox
  164. # for i in range(len(tex_box)):
  165. # for j in range(len(image_box)):
  166. # __find_in(tex_box, image_box, i, j)
  167. for i in range(len(small_text)):
  168. for j in range(len(image_box)):
  169. if not all(small_text[i] == image_box[j]):
  170. find_in(small_text, image_box, i, j)
  171. # textbox向右寻找
  172. for i in range(len(tex_box)):
  173. for j in range(i + 1, len(tex_box)):
  174. _find_right(tex_box, i, j, mix)
  175. tex_box = np.array([i for i in tex_box if i[0]])
  176. text_box = []
  177. for i in tex_box:
  178. if 1 < (i[2] - i[0]) * (i[3] - i[1]) < min_xs:
  179. small_text.append(np.array(i))
  180. else:
  181. text_box.append(i)
  182. small_text = np.array([i for i in small_text if i[0]])
  183. tex_box = pd.DataFrame(text_box)
  184. tex_box[4] = 1
  185. tex_box = tex_box.sort_values(by=0).astype(np.int).values
  186. if len(small_text) > 0:
  187. small_text = pd.DataFrame(small_text)
  188. small_text[4] = 3
  189. small_text = small_text.sort_values(by=0).astype(np.int).values
  190. # textbox向左合并小图
  191. for i in range(len(tex_box)):
  192. if (tex_box[i][2] - tex_box[i][0]) * (tex_box[i][3] - tex_box[i][1]) > 5 * min_xs:
  193. for j in range(len(unknow_box)):
  194. find_left(tex_box, unknow_box, i, j, mix)
  195. for i in range(len(tex_box)):
  196. if (tex_box[i][2] - tex_box[i][0]) * (tex_box[i][3] - tex_box[i][1]) > 5 * min_xs:
  197. for j in range(len(small_text)):
  198. find_left(tex_box, small_text, i, j, 5 * mix)
  199. # textbox向下寻找
  200. for i in range(len(tex_box)):
  201. for j in range(len(unknow_box)):
  202. find_down(tex_box, unknow_box, i, j, mix // 3)
  203. unknow_box = np.array([i for i in unknow_box if i[0]])
  204. for i in range(len(tex_box)):
  205. for j in range(len(small_text)):
  206. find_down(tex_box, small_text, i, j, mix // 3)
  207. small_text = np.array([i for i in small_text if i[0]])
  208. # textbox向上寻找
  209. for i in range(len(tex_box)):
  210. for j in range(len(unknow_box)):
  211. find_top(tex_box, unknow_box, i, j, mix // 3)
  212. for i in range(len(tex_box)):
  213. for j in range(len(small_text)):
  214. find_top(tex_box, small_text, i, j, mix // 3)
  215. image_box = np.array([i for i in image_box if i[0]])
  216. tex_box = np.array([i for i in tex_box if i[0]])
  217. # image_box_bk = deepcopy(image_box)
  218. # 消除内部imagebox
  219. for i in range(len(image_box)):
  220. for j in range(i + 1, len(image_box)):
  221. if not all(image_box[i] == image_box[j]):
  222. find_in(image_box, image_box, i, j, 0)
  223. # 消除内部textbox
  224. for i in range(len(tex_box)):
  225. for j in range(i + 1, len(tex_box)):
  226. if i != j:
  227. find_in(tex_box, tex_box, i, j, -int(mix // 5))
  228. # text_box_p = [i[:4] for i in tex_box]# + [i[:4] for i in small_text]
  229. # image_box_p = [i[:4] for i in image_box]
  230. #
  231. # return text_box_p, image_box_p
  232. #
  233. # for i in range(len(tex_box)):
  234. # for j in range(i+1,len(tex_box)):
  235. # __find_in(tex_box, tex_box, i, j, -int(mix//10))
  236. for i in range(len(small_text)):
  237. for j in range(len(tex_box)):
  238. if not all(small_text[i] == tex_box[j]):
  239. find_in(small_text, tex_box, i, j, 0)
  240. for i in range(len(tex_box)):
  241. for j in range(len(image_box)):
  242. if not all(tex_box[i] == image_box[j]):
  243. find_in(tex_box, image_box, i, j, 0)
  244. for i in range(len(small_text)):
  245. for j in range(len(image_box)):
  246. if not all(small_text[i] == image_box[j]):
  247. find_in(small_text, image_box, i, j, 0)
  248. text_box_p = [i[:4] for i in tex_box if np.sum(i[:4])] + [i[:4] for i in small_text if np.sum(i[:4])]
  249. image_box_p = [i[:4] for i in image_box if np.sum(i[:4])]
  250. return text_box_p, image_box_p
  251. class Neighbor:
  252. def __init__(self, tex_box, im_box):
  253. self.text_box, self.image_box, self.unknow_box = [], [], []
  254. tex_box_df = pd.DataFrame(tex_box)
  255. height = tex_box_df[3] - tex_box_df[1]
  256. mix = height.median()
  257. min_xs = mix ** 2
  258. for i in tex_box:
  259. if (i[2] - i[0]) * (i[3] - i[1]) < min_xs / 2000:
  260. self.unknow_box.append(np.array(i))
  261. else:
  262. self.text_box.append(i)
  263. for i in im_box:
  264. if (i[2] - i[0]) * (i[3] - i[1]) < min_xs:
  265. self.unknow_box.append(i)
  266. else:
  267. self.image_box.append(i)
  268. if len(self.image_box):
  269. image_box = pd.DataFrame(self.image_box)
  270. image_box[4] = 1
  271. self.image_box = image_box.sort_values(by=0).astype(np.int).values
  272. if len(self.unknow_box) > 0:
  273. unknow_box = pd.DataFrame(self.unknow_box)
  274. unknow_box[4] = 2
  275. self.unknow_box = unknow_box.sort_values(by=0).astype(np.int).values
  276. if len(self.text_box) > 0:
  277. tex_box = pd.DataFrame(self.text_box)
  278. tex_box[4] = 1
  279. self.tex_box = tex_box.sort_values(by=0).astype(np.int).values