# html_again_parse.py
# -*- coding: utf-8 -*-
import re
from pprint import pprint

from pyquery import PyQuery as pq
#
# pattern = re.compile(r"\[来源.*?\]|www\..*?com")
#
# filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
#                 "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
#                 "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
#                 "高考试题来源:"]
#
#
# def filter_word(txt_list):
#     new_txt_list = []
#     for word in txt_list:
#         if not word.strip():
#             continue
#         new_word = re.sub(pattern, "", word)
#         for keys in filter_words:
#             if keys in new_word:
#                 new_word = new_word.replace(keys, "")
#         new_txt_list.append(new_word)
#     return new_txt_list
  25. def filter_data(x):
  26. if not str(x).replace(" ", "").strip():
  27. pass
  28. else:
  29. return str(x)
  30. def replace_k(con):
  31. # con = str(con).replace(" ", "+")
  32. # con = str(con).replace(" ", "+")
  33. con = re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "#+#", str(con))
  34. return pq(con, parser="html")
  35. def css_label_wash(content):
  36. # todo add 9-4
  37. """
  38. 清洗文本中带有的css标签
  39. :param content:
  40. :return:
  41. """
  42. # temp_con = re.sub('</?p(\s*|\s+style=.*?")?>', "", str(content))
  43. if re.search('</?(span|font|article|ul|ol|div)(\s*|\s+style=.*?")>|text\s*-\s*decoration: underline|border\s*-\s*bottom:', str(content)) is None:
  44. # content = re.sub('</p\s*>', "\n", content).strip().replace("\n\n", "\n").replace("\n", "<br/>")
  45. content = re.sub('<p(\s*|\s+style=.*?")>', "<p>", content)
  46. content = re.sub('</p><p>', "</p>\n<p>", content)
  47. return content
  48. else:
  49. content = re.sub(r'<font\s+style="color: red">', "", str(content))
  50. content = re.sub(r'<span\s+style="color: red">', "", str(content))
  51. content = re.sub(r'<span\s+style="color: blue">', "", str(content))
  52. content = re.sub(r'<font\s+style="color: blue">', "", str(content))
  53. content = content.replace("</font >", "").replace("</font>", "")
  54. content = content.replace("</span >", "").replace("</span>", "")
  55. content = content.replace('<p style="height: 0;">&nbsp;</p>', "\n").replace('<p><br/></p>', "\n")
  56. content = content.replace("</p>", "\n")
  57. # content = re.sub('<p( style=.*?")?>', "", content)
  58. # content = re.sub('<br\s*/?>', "\n", content)
  59. # parm = False
  60. # if "<article>" not in str(content):
  61. # parm = True
  62. subs2img = {}
  63. if re.search('<img.*? src="',content) or "<table/>" in content:
  64. all_imgs = re.findall('<img.*? src=.*?[\s/"]>|<table.*?>.*?</table>',
  65. content, flags=re.S)
  66. for k, img in enumerate(all_imgs):
  67. content = content.replace(img, "&{}&".format(k))
  68. subs2img["&{}&".format(k)] = img
  69. content = re.sub(r"<(su[bp])>(.*?)</(su[bp])>", r"【\1】\2【/\3】", content)
  70. html = pq(content, parser="html")
  71. a = []
  72. if html.children():
  73. for line in html.children().items(): # <p>.*?</p>里面的内容可能会被过滤掉
  74. test = line.text()
  75. if str(line).startswith("<p") and line.text().strip():
  76. if '<img src="http://' in str(line) or "text-decoration: underline" in str(line) or "border-bottom:" in str(line) or "text - decoration: underline" in str(line):
  77. # a.append(line.html().replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n") + "\n")
  78. a.append(line.html())
  79. else:
  80. line = replace_k(line)
  81. if "<br>" in str(line) or "<br/>" in str(line) or "<br />" in str(line):
  82. line = str(line).replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
  83. line = pq(line)
  84. new_line = list(map(lambda x: str(x).replace("#+#", " "), line.text().split("###")))
  85. a.extend(new_line)
  86. else:
  87. if line.text().strip():
  88. a.append(line.text().replace("#+#", " "))
  89. elif "<article>" in str(line) and "</article>" in str(line):
  90. line = re.sub(r'<p.*?>', "", str(line.html()))
  91. b = line.replace('</p>', ""). \
  92. replace("<br>", "\n"). \
  93. replace("<br/>", "\n"). \
  94. replace("<br />", "\n"). \
  95. replace('<p style="height: 0;">&nbsp;</p>', "\n"). \
  96. replace('<p style="height: 0;"> </p>', "\n")
  97. b_list = b.split("\n")
  98. # b_list = list(filter(lambda x: str(x), b_list))
  99. b_list = list(filter(filter_data, b_list))
  100. b_list = list(map(lambda x: str(x), b_list))
  101. a.extend(b_list)
  102. elif str(line).startswith("<ul"):
  103. a.append(line.text())
  104. elif line.attr("class") == "slave-datas":
  105. for j, ss in enumerate(line.children().items()):
  106. if ss(".read-list-title").text():
  107. a.append(str(j + 1) + "." + ss(".read-list-title").text())
  108. if ss(".read-list-opt").text():
  109. a.append(ss(".read-list-opt").text().replace("\n", "\t"))
  110. if str(ss).startswith("<ul"):
  111. a.append(ss.text())
  112. elif str(line).startswith("<table"):
  113. a.append(str(line))
  114. elif str(line).startswith("<ol"):
  115. for i, ss in enumerate(line.children().items()):
  116. a.append(str(i + 1) + "." + ss.text())
  117. else:
  118. # print('test:',line.text()) # 自动去掉了图片
  119. if line.text().strip():
  120. line = replace_k(line)
  121. # a.append(line.text().replace("+", "\xa0") + "\n")
  122. a.append(line.text().replace("#+#", " "))
  123. else:
  124. # if html.text().strip(): 把换行\n 都去掉了
  125. # a.append(html.text())
  126. # a.append(str(html)) # 会自动带上</?p> ;&变为&amp;
  127. a.append(content.strip())
  128. new_a = "\n".join(list(filter(lambda x: x.strip(), a)))
  129. if subs2img:
  130. new_a = re.sub("|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a)
  131. new_a = "<p>" + new_a.replace("\n\n", "\n").replace("\n", "</p>\n<p>") + "</p>"
  132. # for sb, img in subs2img.items(): # 2021
  133. # new_a = new_a.replace(sb, img)
  134. # if parm:
  135. # new_a[0] = "\xa0" * 4 + new_a[0]
  136. new_a = re.sub(r"【(/?su[bp])】", r"<\1>", new_a)
  137. return new_a
  138. if __name__ == '__main__':
  139. cons = r'''
  140. <div class="stem-wraper" data-v-0b6ea9b4="">
  141. <article>
  142. <p>Nancycarrots are good for eyes.Eat some,please.</p>
  143. </article>
  144. </div>
  145. <div class="topic-analysis" data-v-0b6ea9b4="">
  146. <ul class="topic-analysis-items" data-v-0b6ea9b4="">
  147. <li class="topic-analysis-content" data-v-0b6ea9b4=""><span class="analysis-prefix" data-v-0b6ea9b4="">【答案】</span><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAOQAAABOCAYAAAAw0LoFAAAJY0lEQVR4Ae2dz2sVVxTH+z+5ciF0U7HiRkTciOLOhZuuBVdSshYLrkKDxlU3iqUEqdKGSFMJqU1TaKshJmpNGpvGNs0PSU755vX7cuZ6Z97MvJD7XuZ7Ybi/77n3O+cz581L4H1g+5Smp6ft2LFjdujQIbt9+3au1c3NTbt69erOOIzldfjwYbt3755tb2/nzr127Vp7/MTERHvc5OSknTlzpt3HNS9evGizs7PtcSisra3Z5cuXd8aePn3arl+/bjdu3Mi90I9xWBPzMP+gJq/N+fPnbXl5ufZRZ2Zm7MSJE3uiW57P4J4cPXrURkZGon7z7NkzO3nyZNsvzp49aw8fPrSNjY3ouXDeoaEhO378eHvOlStXbHV1dWc8fPPOnTt26tQpm5qaaq+BNuzlyJEjNjAwEPWlCxcu7Iz5oD1rHwpbW1v29OlTW1hYKLQ2Nja2s+lbt27Zo0eP7PHjx7a0tFQ4B50PHjxoHzYEDf1wKIiKCzcxltB+9+7dnXWQ543j3KrjOa8fc3/W4eHhtiPWOcvi4qINDg6W1rmTDfjV+Ph45pqfny+8fwBodHR050GP/RQ97L19+PHLly93QH/z5o3vsvX1dVtZWcm03b9/386dO2eXLl2yubm5TB8r9N19BZLGlUsBKRBXQEDGdVGrFEiigIBMIruMSoG4AgIyrotapUASBQRkEtllVArEFRCQcV3UKgWSKCAgk8guo1IgroCAjOuiVimQRAEBmUR2GZUCcQUEZFwXtUqBJAoIyCSyy6gUiCsgIOO6qFUKJFFAQCaRXUalQFwBARnXRa1SIIkCAjKJ7DIqBeIKCMi4LmqVAkkUEJBJZJdRKRBXQEDGdVGrFEiigIBMIruMSoG4AgIyrotapUASBQRkEtllVArEFRCQcV3UKgWSKCAgk8guo1IgroCAjOuiVimQRAEBmUR2GZUCcQUEZFwXtUqBJAoIyCSyy6gUiCsgIOO6qFUKJFFAQCaRXUalQFwBARnXRa1SIIkCAjKJ7DIqBeIKCMi4LmqVAkkU6CsgB741+2jI7MNBXWU0gFbQrGoae25284nZ55O6ymgAraDZXqS+ARKO9fFNgVgGRD8GmlWBEo41/KNALAOiHwPN9gLKvgFSkbH+wwjalU2KjPUfRtCu29Q3QPqnvsrV4SzrKP6pr3J1OMvqnDdOQDbkfTTPAcJ2QVgdQq9ZqGfVuoAUkBmf8c6lcnU4M2LWqAhIAZlxG0FYHUKvWUbMGhUBKSAzbuOdS+XqcGbErFERkAIy4zaCsDqEXrOMmDUqAlJAZtzGO5fK1eHMiFmj0igghybNfl1q/clg4pUZLvwJ5ctfzGaX3/9Twidfmf2xmq/q6qbZp9/szkMZ48O2v9bMYDv25xrsB315tkIbsTXKtOWfItuzFxD+8LvZ0r8tZ3711gwX1sVZl9fed/KR38xwzry0uWU2+nx3HsoYzzbkGOPT1rYZ9gG74Xh/Rtr2431/1bLfQ52ygOwAJED1gNH5AVDYB7ABOcYXORjGYR2MwxpYCxcc14Mbs0H7VfOyzlHVAWPj6wAJUAmYXxPAhH3QiYmw+3Eov92IA0kAOR85HhTeZjdlv26dcmOAhOMzARbchFiE9NESQJSNkBj700ILLIAWRkoChH3QBveDHFEUV5iw19gDgeuVzcN18+rdOCPmAh4mRC1ESkLjI6QvxyDhGsh9hCTsHkDYBcywjXbf5/eDtRb+ycLa7XnD+X7fdcqNARKOG35kpWAr661IBVAAEuDC+KIIFfZhLi+sUSZCwil9RCwLV51xPGu
nPHSwOnVCg7k+mq2/a0GDc0MfgIMxHqDQXtgXAobzwAbamaMNda6F/cA2oMV6PnpyzF7lnfTt1N9oIH2EhFCIXnR2H1E7iQgAX//dGoXyZ+P5EZLr8+GAPeACzHnJPyQ4v2qet3bYvheOGQLpIyTseVhigIV7Yh0QT71uwQyoAdbMn61ev2Z4Bj4U8J7482L+w9I/JMI1yta517p544B8t7V7Qz2QHkY6u3/HQxug4RxESNxoRjgfMTEPHz/zoiQh9DcN0GE92KB9vybb6ubeVlG5rOMVjQOQcP6ij6x+vv+4iXbARojDiIax4Rc4GMs5KHMu1sJ43AdESOwLEKOOMm0Vwez3WaZcpG2ZvsYAyYhH8AgFQWO7d3iAlQcVxAXcBBLrMcEhCBj6pxZ2YfZ2YcvXuUeugxz2+/Edkk7uAQE0bPfOHYPMa+C/ASVgPkLCBtYNE23PrbR0xNzwI3Denvz+qpTDPVStNwZIOD8/JoYgICoRSB8VfRlziiIk+nlhHKGM3RBAyLGEkAAflAjZ6c8egAMQARBfhvMz2qFcJkLCFh5cWIcQEiLA+P2L3X6M4Vjaij0kOL9qHrvfVdoaDyQA4Jc6gATQ4qYOP2nduE5i+iiJtZjwjStgi0VIOBwARIJtAko4uQbyfouQcODYOyTOzC91OAYfH/FOGH4E9ednmVGS8OJdkO+QyNEfJmhH4AlhGBHDelUAw/HhHqrWGw0kxAIYX8/sRkhAxWjJKMYcfYSHbcwxBxeiKt4fv5jOB9KvgTLrmA8bWBNlJuwR75O0VSfnWp3y0MHq1EMgYRNA4AsYRqMiENCHh2LMdjgvHBtGSKzBqMhIiTlcO1yP7XXzTvp26m8MkHB6pJjzAyI4DNN387sRjG1FuQeG0IZfyHgbhA5gEUgPIGxhTLhGHRA5p2j/vq+uI3IegEAiUACQAAAMHw1f/P9u5+0XlXGPsB7WgT2UkWgLbUVAYh/8KM39CsgixQv66FhNyvsRSDp6U/MCFy7V1ZgI2SSQY2ct5Q22+1GuqUB1e+6yOueNE5Du29GYIx+UtjwHCNu7dcimzw/1rFoXkAIy4zNNB6rb82fErFERkAIy4zbdOmTT52fErFERkAIy4zZNB6rb82fErFERkAIy4zbdOmTT52fErFERkAIy4zZNB6rb82fErFHpGyD1UwL1/1NHPyWwP3/OadRPCejHduoBqR/b2R8YG/djO4j+gFKRsjyY0KrKL1/xE5Z+jq4axI38OTo6i3IpcJAV6Jt3yIN8E3Q2KUAFBCSVUC4FekABAdkDN0FbkAJUQEBSCeVSoAcUEJA9cBO0BSlABQQklVAuBXpAAQHZAzdBW5ACVEBAUgnlUqAHFBCQPXATtAUpQAUEJJVQLgV6QAEB2QM3QVuQAlTgP96ZXxLDlEz6AAAAAElFTkSuQmCC" alt=""></li>
  148. <li class="topic-analysis-content" data-v-0b6ea9b4=""><span class="analysis-prefix" data-v-0b6ea9b4="">【解析】</span></li>
  149. </ul>
  150. </div>
  151. '''
  152. # cons = '''<p style="margin-top:0;margin-right:0;margin-bottom:0;margin-left:0;text-align:justify;text-justify:inter-ideograph;line-height:150%"><span style="line-height: 150%;font-size: 14px">Unbelievable</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">!</span></span><span style="line-height: 150%;font-size: 14px">Oh..., _____ you don&#39;t mind, I&#39;ll stop and take a deep breath.</span></p><p><br/></p>
  153. # <p>【答案】<p style="margin-top:0;margin-right:0;margin-bottom:0;margin-left:0;text-align:justify;text-justify:inter-ideograph;line-height:150%"><span style=";font-family:宋体;line-height:150%;font-size:14px">1</span><span style="line-height: 150%;font-size: 14px">.if</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">。考查</span></span><span style="line-height: 150%;font-size: 14px">if</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">引导的条件状语从句。根据句意可知。</span></span></p><p><br/></p></p>
  154. # <p>【解析】</p>'''
  155. # pprint(cons)
  156. # print(again_parse(cons))
  157. # print(again_parse(cons))
  158. # print(list(map(lambda x: str(x).replace(" ", " "), again_parse(cons))))
  159. # con1 = r'<p>解:A.研究跨栏动作时,刘翔的大小和形状不能忽略,不能看作质点,故A错误;<br/>B.选取不同的参考系,物体的运动状态是不相同的,故B错误;<br/>C.出租车收费是按路程收费的,故C错误;<br/>D.第<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />是指<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553931930702.png" data-latex="${1 \rm{s} }$" width="12",height="11" />的时间,是指从<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930220437.png" data-latex="${3 \rm{s} }$" width="13",height="11" />末到<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />末这一段时间,故D正确;<br/>故选:D.</p>'
  160. cons = css_label_wash(cons)
  161. print(cons)