html_again_parse.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. # -*- coding: utf-8 -*-
  2. import re
  3. from pprint import pprint
  4. from pyquery import PyQuery as pq
  5. pattern = re.compile(r"\[来源.*?\]")
  6. filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
  7. "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
  8. "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
  9. "高考试题来源:"]
  10. def filter_word(txt_list):
  11. new_txt_list = []
  12. for word in txt_list:
  13. if not word.strip():
  14. continue
  15. new_word = re.sub(pattern, "", word)
  16. for keys in filter_words:
  17. if keys in new_word:
  18. new_word = new_word.replace(keys, "")
  19. new_txt_list.append(new_word)
  20. return new_txt_list
  21. def filter_data(x):
  22. if not str(x).replace(" ","").strip():
  23. pass
  24. else:
  25. return str(x)
  26. def again_parse2(content):
  27. # todo add 9-4
  28. content = re.sub(r'<font style="color: red">', "", str(content))
  29. content = re.sub(r'<font style="color: blue">', "", str(content))
  30. content = str(content).replace("</font>", "")
  31. content = str(content).replace('<p style="height: 0;">&nbsp;</p>', "\n")
  32. parm = False
  33. if "<article>" not in str(content):
  34. parm = True
  35. html = pq(content, parser="html")
  36. a = []
  37. if html.children():
  38. for line in html.children().items():
  39. if str(line).startswith("<p") and line.text().strip():
  40. if '<img src="http://zsytk2.zhixinhuixue.com/static/images' in str(
  41. line) or "text-decoration: underline" in str(line) or "border-bottom:" in str(line):
  42. a.append(line.html().replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n") + "\n")
  43. else:
  44. if "<br>" in str(line) or "<br/>" in str(line) or "<br />" in str(line):
  45. line = str(line).replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
  46. line = pq(line)
  47. new_line = list(map(lambda x: str(x) + "\n", line.text().split("###")))
  48. a.extend(new_line)
  49. else:
  50. if line.text().strip():
  51. a.append(line.text() + "\n")
  52. elif str(line).startswith("<article>"):
  53. line = re.sub(r'<p.*?>', "", str(line.html()))
  54. b = line.replace('</p>', ""). \
  55. replace("<br>", "\n"). \
  56. replace("<br/>", "\n"). \
  57. replace("<br />", "\n"). \
  58. replace('<p style="height: 0;">&nbsp;</p>', "\n").\
  59. replace('<p style="height: 0;"> </p>', "\n")
  60. b_list = b.split("\n")
  61. # b_list = list(filter(lambda x: str(x), b_list))
  62. b_list = list(filter(filter_data, b_list))
  63. b_list = list(map(lambda x: str(x) + "\n", b_list))
  64. a.extend(b_list)
  65. elif str(line).startswith("<ul"):
  66. a.append(line.text().replace("\n", " ") + "\n")
  67. elif line.attr("class") == "slave-datas":
  68. for j, ss in enumerate(line.children().items()):
  69. if ss(".read-list-title").text():
  70. a.append(str(j + 1) + "." + ss(".read-list-title").text() + "\n")
  71. if ss(".read-list-opt").text():
  72. a.append(ss(".read-list-opt").text().replace("\n", "\t") + "\n")
  73. if str(ss).startswith("<ul"):
  74. a.append(ss.text() + "\n")
  75. elif str(line).startswith("<table"):
  76. a.append(str(line) + "\n")
  77. elif str(line).startswith("<ol"):
  78. for i, ss in enumerate(line.children().items()):
  79. a.append(str(i + 1) + "." + ss.text().replace("\n", " ") + "\n")
  80. else:
  81. if line.text().strip():
  82. a.append(line.text() + "\n")
  83. else:
  84. if html.text().strip():
  85. a.append(html.text() + "\n")
  86. new_a = list(filter(lambda x: x.strip(), a))
  87. if parm:
  88. new_a[0] = "\xa0"*4 + new_a[0]
  89. return new_a
  90. def replace_k(con):
  91. con = str(con).replace("&nbsp;", "+")
  92. return pq(con, parser="html")
  93. def again_parse(content):
  94. # todo add 9-4
  95. content = re.sub(r'<font style="color: red">', "", str(content))
  96. content = re.sub(r'<font style="color: blue">', "", str(content))
  97. content = str(content).replace("</font>", "")
  98. content = str(content).replace('<p style="height: 0;">&nbsp;</p>', "\n")
  99. parm = False
  100. if "<article>" not in str(content):
  101. parm = True
  102. html = pq(content, parser="html")
  103. a = []
  104. if html.children():
  105. for line in html.children().items():
  106. if str(line).startswith("<p") and line.text().strip():
  107. if '<img src="http://zsytk2.zhixinhuixue.com/static/images' in str(line) or "text-decoration: underline" in str(line) or "border-bottom:" in str(line) or "text - decoration: underline" in str(line):
  108. a.append(line.html().replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n") + "\n")
  109. else:
  110. line = replace_k(line)
  111. if "<br>" in str(line) or "<br/>" in str(line) or "<br />" in str(line):
  112. line = str(line).replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
  113. line = pq(line)
  114. new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###")))
  115. a.extend(new_line)
  116. else:
  117. if line.text().strip():
  118. a.append(line.text().replace("+", "\xa0") + "\n")
  119. elif str(line).startswith("<article>"):
  120. line = re.sub(r'<p.*?>', "", str(line.html()))
  121. b = line.replace('</p>', ""). \
  122. replace("<br>", "\n"). \
  123. replace("<br/>", "\n"). \
  124. replace("<br />", "\n"). \
  125. replace('<p style="height: 0;">&nbsp;</p>', "\n"). \
  126. replace('<p style="height: 0;"> </p>', "\n")
  127. b_list = b.split("\n")
  128. # b_list = list(filter(lambda x: str(x), b_list))
  129. b_list = list(filter(filter_data, b_list))
  130. b_list = list(map(lambda x: str(x) + "\n", b_list))
  131. a.extend(b_list)
  132. elif str(line).startswith("<ul"):
  133. a.append(line.text() + "\n")
  134. elif line.attr("class") == "slave-datas":
  135. for j, ss in enumerate(line.children().items()):
  136. if ss(".read-list-title").text():
  137. a.append(str(j + 1) + "." + ss(".read-list-title").text() + "\n")
  138. if ss(".read-list-opt").text():
  139. a.append(ss(".read-list-opt").text().replace("\n", "\t") + "\n")
  140. if str(ss).startswith("<ul"):
  141. a.append(ss.text() + "\n")
  142. elif str(line).startswith("<table"):
  143. a.append(str(line) + "\n")
  144. elif str(line).startswith("<ol"):
  145. for i, ss in enumerate(line.children().items()):
  146. a.append(str(i + 1) + "." + ss.text() + "\n")
  147. else:
  148. if line.text().strip():
  149. line = replace_k(line)
  150. a.append(line.text().replace("+", "\xa0") + "\n")
  151. else:
  152. if html.text().strip():
  153. a.append(html.text() + "\n")
  154. new_a = list(filter(lambda x: x.strip(), a))
  155. # if parm:
  156. # new_a[0] = "\xa0" * 4 + new_a[0]
  157. return filter_word(new_a)
# Manual smoke test: runs again_parse on a sample cloze passage (with blank
# images, an answer paragraph and a nested explanation paragraph) and
# pretty-prints the resulting list of lines.
if __name__ == '__main__':
    # Sample markup used as the fixture; it is runtime data, not documentation.
    cons = '''<p>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Amelia Earhart was <font style="color: red">borm</font> in 1897 in Kansas She and her younger sister Muriel were very active. They</p><p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;rode horses, played baseball and basketball.<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;In 1916 Amelia completed high school and then prepared to enter<img src="http://zsytk2.zhixinhuixue.com/static/images/1.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/1.gif" class="tiankong" data-num="0">university.<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;During a holiday, she visited<img src="http://zsytk2.zhixinhuixue.com/static/images/2.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/2.gif" class="tiankong" data-num="1">(she)sister in Toronto<font style="color: blue">,Canada</font>. World War One<img src="http://zsytk2.zhixinhuixue.com/static/images/3.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/3.gif" class="tiankong" data-num="2">(begin)by then.<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;And Amelia<img src="http://zsytk2.zhixinhuixue.com/static/images/4.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/4.gif" class="tiankong" data-num="3">(shock)by the number of wounded soldiers<font style="color: blue">,_</font><img src="http://zsytk2.zhixinhuixue.com/static/images/5.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/5.gif" class="tiankong" data-num="4">were sent home from the fighting in<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;France. She decided that she would be more<img src="http://zsytk2.zhixinhuixue.com/static/images/6.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/6.gif" class="tiankong" data-num="5">(use)as a nurse than as a student. 
So she joined the Red<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;Cross.<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Amelia Earhart first became interested in flying when<img src="http://zsytk2.zhixinhuixue.com/static/images/7.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/7.gif" class="tiankong" data-num="6">(live)in Toronto So she wanted to learn to<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;fly. Then she<img src="http://zsytk2.zhixinhuixue.com/static/images/8.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/8.gif" class="tiankong" data-num="7">(receive)her official pilot's license. On May <font style="color: red">20th</font>, 1932,Amelia took<img src="http://zsytk2.zhixinhuixue.com/static/images/9.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/9.gif" class="tiankong" data-num="8">from<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;Newfoundland in a small red and gold plane. And after fifteen hours she landed in Ireland. She became the<p style="height: 0;">&nbsp;</p>&nbsp;&nbsp;&nbsp;first woman<img src="http://zsytk2.zhixinhuixue.com/static/images/10.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/10.gif" class="tiankong" data-num="9">(<font style="color: red">ly</font>)across the Atlantic Ocean alone. <font style="color: blue"> </font></p>
<p>【答案】<span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">1.</span><span data-v-73bdf652="" class="dib">a</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">2.</span><span data-v-73bdf652="" class="dib">her</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">3.</span><span data-v-73bdf652="" class="dib">had begun</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">4.</span><span data-v-73bdf652="" class="dib">was shocked</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">5.</span><span data-v-73bdf652="" class="dib">who</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">6.</span><span data-v-73bdf652="" class="dib">useful</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">7.</span><span data-v-73bdf652="" class="dib">living</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">8.</span><span data-v-73bdf652="" class="dib">received</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">9.</span><span data-v-73bdf652="" class="dib">off</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">10.</span><span data-v-73bdf652="" class="dib">to fly</span></span></p>
<p>【解析】<p>asdasdasd</p></p>'''
    # Alternative ways of inspecting the fixture / result, kept for debugging:
    # pprint(cons)
    # print(again_parse(cons))
    pprint(again_parse(cons))