html_again_parse.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. # -*- coding: utf-8 -*-
  2. import re
  3. from pprint import pprint
  4. from pyquery import PyQuery as pq
  5. #
  6. # pattern = re.compile(r"\[来源.*?\]|www\..*?com")
  7. #
  8. # filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
  9. # "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
  10. # "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
  11. # "高考试题来源:"]
  12. #
  13. #
  14. # def filter_word(txt_list):
  15. # new_txt_list = []
  16. # for word in txt_list:
  17. # if not word.strip():
  18. # continue
  19. # new_word = re.sub(pattern, "", word)
  20. # for keys in filter_words:
  21. # if keys in new_word:
  22. # new_word = new_word.replace(keys, "")
  23. # new_txt_list.append(new_word)
  24. # return new_txt_list
  25. def filter_data(x):
  26. if not str(x).replace(" ", "").strip():
  27. pass
  28. else:
  29. return str(x)
  30. def replace_k(con):
  31. # con = str(con).replace(" ", "+")
  32. # con = str(con).replace(" ", "+")
  33. con = re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "#+#", str(con))
  34. return pq(con, parser="html")
  35. def css_label_wash(content):
  36. # todo add 9-4
  37. """
  38. 清洗文本中带有的css标签
  39. :param content:
  40. :return:
  41. """
  42. # temp_con = re.sub('</?p(\s*|\s+style=.*?")?>', "", str(content))
  43. if re.search('</?(span|font|article|ul|ol|div)(\s*|\s+style=.*?")>|text\s*-\s*decoration: underline|border\s*-\s*bottom:', str(content)) is None:
  44. # content = re.sub('</p\s*>', "\n", content).strip().replace("\n\n", "\n").replace("\n", "<br/>").
  45. content = re.sub('<p(\s*|\s+style=.*?")>', "<p>", content)
  46. content = re.sub('</p><p>', "</p>\n<p>", content)
  47. return content
  48. else:
  49. content = re.sub(r'<font\s+style="color: red">', "", str(content))
  50. # content = re.sub(r'<span\s+style="color: red">', "", str(content))
  51. #20240704/红色标记先保留
  52. content = re.sub(r'<span\s+style="color: red">(.+?)</span>', r"【red##\1】", str(content))
  53. content = re.sub(r'<span\s+style="color: blue">', "", str(content))
  54. content = re.sub(r'<font\s+style="color: blue">', "", str(content))
  55. content = content.replace("</font >", "").replace("</font>", "")
  56. # content = content.replace("</span >", "").replace("</span>", "")
  57. content = content.replace('<p style="height: 0;">&nbsp;</p>', "\n").replace('<p><br/></p>', "\n")
  58. # content = content.replace("</p>", "</p>\n") # 2022-4-25
  59. content = re.sub(r"</p>\s*(?!\n)", "</p>\n", content)
  60. content = re.sub(r"<br\s*/?>", "<p>", content) # 2022/1/6
  61. # content = re.sub('<p( style=.*?")?>', "", content)
  62. # content = re.sub('<br\s*/?>', "\n", content)
  63. # parm = False
  64. # if "<article>" not in str(content):
  65. # parm = True
  66. subs2img = {}
  67. if re.search('<img.*? src="',content) or "</table>" in content:
  68. all_imgs = re.findall('<img.*? src=.*?[\s/"]>|<table.*?>.*?</table>',
  69. content, flags=re.S)
  70. for k, img in enumerate(all_imgs):
  71. content = content.replace(img, "&{}&".format(k))
  72. #表格里的公式的标签需要清洗20240704
  73. if "</table>" in img and "math-tex" in img:
  74. img = re.sub(r'<span class="math-tex">(.+?)</span>', r'\1', img)
  75. subs2img["&{}&".format(k)] = img
  76. content = re.sub(r"<(su[bp])>(.*?)</(su[bp])>", r"【\1】\2【/\3】", content)
  77. content = content.replace("&lt;", "【#lt;】")
  78. html = pq(content, parser="html")
  79. a = []
  80. if html.children():
  81. for line in html.children().items(): # <p>.*?</p>里面的内容可能会被过滤掉
  82. test = line.text()
  83. # 保留下划线及着重符标签 <span style="text-decoration: underline;">
  84. # 波浪线:<span style="text-decoration: underline wavy;">
  85. # pq会将多个空格换成一个
  86. if '<span style="text-decoration: underline' in str(line) or '<span class="dots"' in str(line) \
  87. or '<p style="text-indent:' in str(line) or '<p style="text-align:' in str(line):
  88. line = re.sub(r'<span style="text-decoration: underline(.*?">.+?)</span>', r"【1#\1##】", str(line))
  89. line = re.sub(r'<span class="dots">(.+?)</span>', r"【2#\1##】", str(line))
  90. line = re.sub(r'<(p style="text-(indent|align):.*?">.+?)</p>', r"【\1##3】", str(line))
  91. line = line.replace(" ", "【+】")
  92. line = pq(line)
  93. new_line = list(map(lambda x: str(x).replace("【1#", '<span style="text-decoration: underline')
  94. .replace("##】", "</span>").replace("【2#", '<span class="dots">')
  95. .replace("【p【+】style=", "<p style=").replace("##3】", "</p>").replace("【+】", " "),
  96. line.text().split("\n")))
  97. a.extend(new_line)
  98. elif str(line).startswith("<p") and line.text().strip():
  99. if '<img src="http://' in str(line) or "text-decoration: underline" in str(line) or "border-bottom:" in str(line) or "text - decoration: underline" in str(line):
  100. # a.append(line.html().replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n") + "\n")
  101. a.append(line.html())
  102. else:
  103. line = replace_k(line)
  104. if re.search(r"\n|<br\s*/?>", str(line), flags=re.S):
  105. line = re.sub(r"\n|<br\s*/?>", "#*#", str(line), flags=re.S)
  106. line = pq(line)
  107. new_line = list(map(lambda x: str(x).replace("#+#", " "), line.text().split("#*#")))
  108. a.extend(new_line)
  109. else:
  110. if line.text().strip():
  111. a.append(line.text().replace("#+#", " "))
  112. elif "<article>" in str(line) and "</article>" in str(line):
  113. line = re.sub(r'<p.*?>', "", str(line.html()))
  114. b = line.replace('</p>', ""). \
  115. replace("<br>", "\n"). \
  116. replace("<br/>", "\n"). \
  117. replace("<br />", "\n"). \
  118. replace('<p style="height: 0;">&nbsp;</p>', "\n"). \
  119. replace('<p style="height: 0;"> </p>', "\n")
  120. b_list = b.split("\n")
  121. # b_list = list(filter(lambda x: str(x), b_list))
  122. b_list = list(filter(filter_data, b_list))
  123. b_list = list(map(lambda x: str(x), b_list))
  124. a.extend(b_list)
  125. elif str(line).startswith("<ul"):
  126. a.append(line.text())
  127. elif line.attr("class") == "slave-datas":
  128. for j, ss in enumerate(line.children().items()):
  129. if ss(".read-list-title").text():
  130. a.append(str(j + 1) + "." + ss(".read-list-title").text())
  131. if ss(".read-list-opt").text():
  132. a.append(ss(".read-list-opt").text().replace("\n", "\t"))
  133. if str(ss).startswith("<ul"):
  134. a.append(ss.text())
  135. elif str(line).startswith("<table"):
  136. a.append(str(line))
  137. elif str(line).startswith("<ol"):
  138. for i, ss in enumerate(line.children().items()):
  139. a.append(str(i + 1) + "." + ss.text())
  140. else:
  141. # print('test:',line.text()) # 自动去掉了图片
  142. if line.text().strip():
  143. line = replace_k(line)
  144. # a.append(line.text().replace("+", "\xa0") + "\n")
  145. a.append(line.text().replace("#+#", " "))
  146. else:
  147. # if html.text().strip(): 把换行\n 都去掉了
  148. # a.append(html.text())
  149. # a.append(str(html)) # 会自动带上</?p> ;&变为&amp;
  150. a.append(content.strip())
  151. new_a = "\n".join(list(map(lambda x: str(x).strip(), a)))
  152. new_a = re.sub("(\n\s*)+", "\n", new_a)
  153. # print("newa:::", new_a)
  154. if subs2img:
  155. new_a = re.sub(r"|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a)
  156. new_a = "<p>" + new_a.replace("\n\n", "\n").replace("\n", "</p>\n<p>") + "</p>" #第2个replace:“\n”前加r
  157. # for sb, img in subs2img.items(): # 2021
  158. # new_a = new_a.replace(sb, img)
  159. # if parm:
  160. # new_a[0] = "\xa0" * 4 + new_a[0]
  161. new_a = re.sub(r"【(/?su[bp])】", r"<\1>", new_a).replace("【#lt;】", "&lt;")
  162. new_a = re.sub(r"【red##(.*?)】", r'<span style="color: red">\1</span>', new_a)
  163. return new_a
  164. if __name__ == '__main__':
  165. cons = r'''
  166. <div>(1)求货物和重物的质量关系;</div>
  167. (2)要使货物运送到B端,若采用在重物下方挖坑的方法,求至少挖多深的坑;<br>(3)要使货物运送到B端,若采用配重落地时传送带立刻顺时针转动的方法(启动时间可<br>忽略),求传送带速度大小的范围和货物从A端传送到B端所用时间的范围(结果保留三位有<br>效数字)。<br><br><br><br>物理参考答案<br>
  168. <div>1.C</div>
  169. <div>【解析】绝大多数α粒子沿直线穿过,偏转角很</div>
  170. <div>小,说明原子核很小;A项错误;少数α粒子穿过金箱</div>
  171. <div>后发生较犬角度的偏转是由于少数α粒子穿过金箱</div>
  172. <div>时距离金原子核较近,受到的库仑斥力较大,B项错</div>
  173. <div>误;极少数α粒子被弹回,说明原子核是一个体积小、</div>
  174. '''
  175. cons1 = '''
  176. 9 . 中国古代的政治权力由“传贤”转变为“传子”,“家天下”制度开始形成于<table name=\"optionsTable\" style=\"width:100%;table-layout:fixed;\" cols=\"4\"><tr><td>A.夏朝</td><td>B.商朝</td><td>C.周朝</td><td>D.秦朝</td></tr></table>
  177. '''
  178. # pprint(cons)
  179. # print(again_parse(cons))
  180. # print(again_parse(cons))
  181. # print(list(map(lambda x: str(x).replace(" ", " "), again_parse(cons))))
  182. # con1 = r'<p>解:A.研究跨栏动作时,刘翔的大小和形状不能忽略,不能看作质点,故A错误;<br/>B.选取不同的参考系,物体的运动状态是不相同的,故B错误;<br/>C.出租车收费是按路程收费的,故C错误;<br/>D.第<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />是指<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553931930702.png" data-latex="${1 \rm{s} }$" width="12",height="11" />的时间,是指从<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930220437.png" data-latex="${3 \rm{s} }$" width="13",height="11" />末到<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />末这一段时间,故D正确;<br/>故选:D.</p>'
  183. path2 = r"F:\zwj\Text_Structure\accept_files\667d0bec1f8a0743e2aabc78_2.html"
  184. html = open(path2, "r", encoding="utf-8").read()
  185. cons = css_label_wash(html)
  186. with open(r"F:\zwj\Text_Structure\accept_files\temp.txt", "w",encoding='utf-8') as f:
  187. f.write(cons)
  188. print(cons)