# washutil.py
  1. import re
  2. # import sys
  3. from PointerNet import config
  4. from bs4 import BeautifulSoup
  5. from Utils.main_clear.sci_clear import non_data_latex_iter, non_data_latex_regexp
# Project logger: wash/clean messages are filed under the "tmp_clear"
# category of the clear log (config.myLog is project-local; signature
# assumed from usage — TODO confirm).
logger = config.myLog(__name__, log_cate="tmp_clear",
                      subject="clear_log").getlog()
# Maps the string form of a blank's serial number ("1".."20") to the
# circled-digit character used to render fill-in-the-blank markers.
num2circle = {"1": "①", "2": "②", "3": "③", "4": "④",
              "5": "⑤", "6": "⑥", "7": "⑦", "8": "⑧",
              "9": "⑨", "10": "⑩", "11": "⑪", "12": "⑫",
              "13": "⑬", "14": "⑭", "15": "⑮", "16": "⑯",
              "17": "⑰", "18": "⑱", "19": "⑲", "20": "⑳"}
  13. def ltx_wash(ss):
  14. raw_ss = ss
  15. try:
  16. ss = non_data_latex_iter(ss) # 拿到字符串中的latex再转maple
  17. ss = re.sub(r'([a-zA-Z\d])\*([a-zA-Z\d])', r"\1\2", ss)
  18. except:
  19. try:
  20. ss = non_data_latex_regexp(ss) # 拿到字符串中的latex再转maple
  21. ss = re.sub(r'([a-zA-Z\d])\*([a-zA-Z\d])', r"\1\2", ss)
  22. except:
  23. pass
  24. if ss.replace("$", "").strip():
  25. ss = "${}$".format(ss)
  26. ss = re.sub(r'\$\s*\$(.+?)\$\s*\$', r"$\1$", ss.strip())
  27. ss = f"【{ss}##&##{raw_ss}】"
  28. else:
  29. ss = "【公式】"
  30. return ss
def simpwash(html, paper_id, need_latex=0):
    """Wash a question's HTML into one simplified text string.

    Strips presentation attributes, normalizes ``<p>``/``<br>`` markup and
    replaces formula ``<img>`` tags with washed-latex placeholders.

    NOTE (original author's warning): inside data-latex="xxx" there must be
    no nested double quote, otherwise BeautifulSoup drops content.

    :param html: raw HTML of one question
    :param paper_id: paper identifier, used only in log messages
    :param need_latex: when truthy, abort on the first formula image that
                       carries no usable latex and set the returned flag
    :return: (washed_text, imgs, formula_without_ltx) — imgs stays [] in
             the current implementation (collection code is disabled)
    """
    imgs = []
    formula_without_ltx = 0
    # Turn inner double quotes of data-latex values into single quotes so
    # BeautifulSoup does not truncate the attribute.
    html = re.sub(r'data-latex="(.*?)(?<!\\)"(?=[\s/>])', lambda x: 'data-latex="{}"'.format(
        x.group(1).replace("\"", "'")), html, flags=re.S)
    html = html.replace(r'*-*', '').replace(r'\a*rg', 'arg').replace('<latex>', '$').replace('</latex>', '$')
    html = re.sub(r'<br/\s*>', "<br>", html)
    html = html.replace("&#39;", "'").replace("&nbsp;", " ").replace("\xa0", " ")
    # Drop presentation attributes from <td>/<tr> tags.
    html = re.sub(r'(<t[dr] [^<>]*?)style="[^<>]*?"', r"\1", html)
    html = re.sub(r'(<t[dr] [^<>]*?)valign="[^<>]*?"', r"\1", html)
    html = re.sub(r'(<t[dr] [^<>]*?)align="[^<>]*?"', r"\1", html)
    html = re.sub(r'(<t[dr] [^<>]*?)class="[^<>]*?"', r"\1", html)
    html = re.sub(r'(<t[dr] [^<>]*?)width="[^<>]*?"', r"\1", html)
    html = re.sub(r'(<t[dr] [^<>]*?)height="[^<>]*?"', r"\1", html)
    # Remove empty styled paragraphs.
    html = re.sub(r'<p style="[^<>]*?">\s*</p>', "", html)
    if re.search('</?(span|font|article|ul|ol|div)(\s*|\s+style=.*?"|\s+class=.*?")>|text\s*-\s*decoration: underline|border\s*-\s*bottom:'
                 '|class="gsImgLatex| type="| style="| class="', str(html)) is None:
        # Fast path: markup is simple enough to wash with regexes only.
        s = re.sub('<p(\s*|\s+style=.*?")>', "<p>", html)
        s = re.sub('</p><p>', "<br>", s)
        s = re.sub('</p>|<p>', "<br>", s)

        # latex handling
        def sub4(ss):
            # Replace a formula <img> with its washed latex; keep the raw
            # tag wrapped in a placeholder when washing fails.
            if ss.group(2).replace("$", "").strip():
                new_ltx = ltx_wash(ss.group(2))
                if new_ltx != "【公式】":
                    return new_ltx
            return f"【公式{ss.group(0)}】"
        # Normalize \[...\] delimiters to $...$ before washing.
        s = re.sub(r' data-latex="(\\\\\[|\\\[)(.*?)(\\\]|\\\\\])"', r' data-latex="$\2$"', s)
        s = re.sub(r'<img src=((?!src).)+?data-latex="(\$?((?!["/]>).)+?\$?)".*? />', sub4, s, flags=re.S)
    else:
        # Slow path: parse with BeautifulSoup and walk the prettified lines.
        soup = BeautifulSoup(html, features="lxml")
        s = ''
        quan_begin_with_zero = 0
        all_parts = soup.prettify().split('\n')  # must split on \n here
        for nn, i in enumerate(all_parts):
            if i.strip().startswith('<img'):
                s2 = BeautifulSoup(i, features="lxml")
                if s2.img:
                    s3 = s2.img.get('data-latex')
                    if s3:
                        # Anything left besides \[ \] $ and whitespace?
                        if re.sub(r"^\\\[|\\\]$|\$|\s+", "", s3):
                            s3 = re.sub(r"^\\\[(.*?)\\\]$", r"$\1$", s3)
                            if re.match("\$.*?\$$", s3.strip()) is None:
                                s3 = "${}$".format(s3)
                            s3 = ltx_wash(s3)
                            if s3 == "【公式】":
                                s3 = f"【公式{s2.img}】"
                            s += s3.replace('%20', ' ').replace('%3C', '<').replace('%3E', '>').strip()
                        else:
                            # data-latex was effectively empty.
                            if need_latex:
                                formula_without_ltx = 1
                                break
                            s += f"【公式{s2.img}】"
                    else:
                        s3 = s2.img.get('src')
                        if not s3:
                            continue
                        elif 'class="gsImgLatex mathType"' in i:
                            if len(s3.split('?')) == 2:
                                # latex is carried in the URL query string, e.g.
                                # .../1636638682578739.gif?%20-%20{e^{%20-%20x}}%20-%203x
                                s3 = ltx_wash("${}$".format(s3.split('?')[-1]))
                                if s3 == "【公式】":
                                    s3 = f"【公式{s2.img}】"
                            else:
                                if re.search('.gif("|$)', s3) is None:
                                    logger.info("【{}】特殊公式格式{}".format(paper_id, str(i)))
                                if need_latex:
                                    formula_without_ltx = 1
                                    break
                                s3 = f"【公式{s2.img}】"
                        else:
                            if 'data-type="math"' in i:
                                if need_latex:
                                    formula_without_ltx = 1
                                    break
                                s3 = f"【公式{s2.img}】"
                            elif 'class="tiankong"' in i:  # blanks: ①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳
                                serial_num = int(s2.img.get('data-num'))
                                # Some papers number blanks from 0; shift the
                                # whole sequence up by one in that case.
                                if serial_num == 0:
                                    quan_begin_with_zero = 1
                                if quan_begin_with_zero:
                                    serial_num = serial_num + 1
                                if 1 <= serial_num <= 20:
                                    s3 = '__{}__'.format(num2circle[str(serial_num)])
                                elif int(serial_num) + 1 > 20:
                                    s3 = '__({})__'.format(serial_num)
                                else:
                                    s3 = '____'
                                    logger.info("【{}】特殊带圈符号:{}".format(paper_id, str(i)))
                            else:
                                # Plain image: keep the raw <img> tag.
                                # (img-height filtering / imgs collection is disabled.)
                                s3 = s2.img.__str__()
                        s += s3
            elif i.strip().startswith("<table"):
                s += "<table>"
            elif re.match("</?t[rd][\s>]|</tbody>|</table>", i.strip()):
                if re.match("</", i.strip()):
                    s += i.strip()
                else:
                    # Opening tag: make sure it is separated from prior text.
                    s += i.strip() if re.search("[\s>]$", s) else " " + i.strip()
            elif i.strip().startswith('<span '):
                if "underline" in i.strip():
                    s += "_______"
            elif i.strip().startswith('<'):
                if re.match("<br\s*/?>|</p>", i.strip()):
                    s += "<br>"
                pass
            else:
                s += i.strip()
    s = re.sub(' +', " ", s)  # collapse runs of spaces (not all whitespace)
    # Normalize <br> runs and <br> around ";" / table cell ends.
    s = re.sub(r'<br>\s*[;;]\s*<br>', ";<br>", s)
    s = re.sub(r'(<br>)+', "<br>", s)
    s = re.sub('<br>(<br>|\s)+', "<br>", s)
    s = re.sub(r'(<br>)+(</t[dr]>)', r"\2", s)
    # Decode entities and percent-escapes; note '#' is stripped entirely.
    s = s.replace('#', '').replace("&#39;", "'").replace("&nbsp;", " ").replace("\xa0", " ")
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace('%20', '').replace('%22', '"').replace('%40', '@')
    s = s.replace('%25', '%').replace('%26', '&').replace('%23', '#').replace('%28', '(').replace('%29', ')')
    s = s.replace('%2B', '+').replace('%2C', ',').replace('%2F', '/')
    s = s.replace('%3E', '>').replace('%3F', '?').replace('%5C', '\\').replace('%7C', '|')
    s = s.replace('%3C', '<').replace('%3D', '=').replace('%3A', ':').replace('%3B', ';')
    return s.strip(), imgs, formula_without_ltx
  193. def again_wash(item, paper_id):
  194. """
  195. 对试题str再次清洗
  196. """
  197. item = re.sub('</table>', '</table><br>', item)
  198. item = re.sub(r"【(<img .*?\"\s*/?>公式latex提取失败)】", r"【公式\1】", item)
  199. item = re.sub(r"(?<!公式)(<img .*?[\"']\s*/?>)", r"【图片\1】", item)
  200. item = re.sub(r"<span style=\"color: red\">(.*?)</span>", r"\1", item)
  201. item = re.sub(r'<([a-z]+) [a-z]+="[^<>]*?"\s*/?>', lambda x: "" if x.group(1) != "img" else x.group(0), item)
  202. # item_str = re.sub(r"<([a-z]+)>(.+?)</\1>", lambda x: x.group(2) if x.group(1) not in ["sub", "sup"] else x.group(0), item_str)
  203. item = re.sub(r'</?body>|</?head>|</?html>|<p>|</p>', '<br>', item)
  204. item = re.sub(r'<b\*?r\s*/?>', '<br>', item)
  205. item = re.sub(r'(<br>)+', "<br>", item)
  206. # item = re.sub(r'\n+', "\n", item)
  207. # item = re.sub(r'\\n+', "\n", item)
  208. item = re.sub("\s{3,}", " ", item)
  209. # 保留原始公式和图片
  210. text_with_imginfo = re.sub(r"【图片([^】]+?)】", r"\1", item)
  211. text_with_imginfo = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\2", text_with_imginfo)
  212. text_with_imginfo = re.sub(r"【公式([^】]+?)】", lambda x: f"【{x.group(1)}】"
  213. if "latex提取失败" in x.group(1) else x.group(1), text_with_imginfo)
  214. # 简化了公式、图片和表格
  215. simply_text = re.sub("【图片[^】]+?】", "【图片】", item)
  216. simply_text = re.sub(r"【([^】]+?)##&##([^】]+?)】", r"\1", simply_text)
  217. simply_text = re.sub("【公式[^】]+?】", "【公式】", simply_text)
  218. simply_text = re.sub(r'<sub>(.+?)</sub>', r'_{\1}', simply_text)
  219. simply_text = re.sub(r'<sup>(.+?)</sup>', r'^{\1}', simply_text)
  220. # 表格简化
  221. simply_text2 = re.sub(r'<sub>|</sub>|<td>\s*</td>', '', simply_text)
  222. simply_text2 = re.sub(r'<td .+?["\']>\s*</td>', '', simply_text2)
  223. simply_text2 = re.sub(r'</td>\s*<td( .+?["\'])?>', ' ', simply_text2)
  224. simply_text2 = re.sub(r'<tr .+?["\']>\s*</tr>|<table>|</?tbody>|<table .+?["\']>', '', simply_text2)
  225. simply_text2 = re.sub(r'<tr( .+?["\'])?>\s*<td( .+?["\'])?>', '<tr>', simply_text2)
  226. simply_text2 = re.sub('</td></tr>', '', simply_text2)
  227. simply_text2 = re.sub(" {3,}", " ", simply_text2)
  228. sents_with_imginfo = [i.strip() for i in text_with_imginfo.split("<br>") if i.strip()]
  229. simply_sents = [i.strip() for i in simply_text2.split("<br>") if i.strip()]
  230. if len(simply_sents) != len(sents_with_imginfo):
  231. simply_sents = [i.strip() for i in simply_text.split("<br>") if i.strip()]
  232. if len(simply_sents) != len(sents_with_imginfo):
  233. print("清洗有重大问题!!!!!")
  234. logger.info(f"【{paper_id}】清洗有重大问题")
  235. return sents_with_imginfo, simply_sents