# html_again_parse2.py (7.4 KB)
  1. # -*- coding: utf-8 -*-
  2. import re
  3. from pprint import pprint
  4. from pyquery import PyQuery as pq
  5. def again_parse(content):
  6. html = pq(content, parser="html")
  7. a = []
  8. # print("###:", html.children())
  9. if html.children():
  10. for i, line in enumerate(html.children().items()):
  11. if str(line).startswith("<p") and line.text().strip():
  12. if 0 <= len(line.children()) < 2:
  13. if "<img" in str(line) or "text-decoration:underline" in str(line) or "text-decoration: underline" in str(line) or "border-bottom:" in str(line):
  14. a.append(line.html().replace("<br/>", "\n").replace("<br>", "\n"))
  15. else:
  16. if "<br>" in str(line) or "<br/>" in str(line):
  17. line = str(line).replace("<br/>", "###").replace("<br>", "###")
  18. line = pq(line)
  19. new_line = list(map(lambda x: str(x) + "\n", line.text().split("###")))
  20. a.extend(new_line)
  21. else:
  22. if line.text().strip():
  23. a.append(line.text() + "\n")
  24. elif len(line.children()) > 1:
  25. res2 = []
  26. for spans in line.children().items():
  27. if "<img" in str(spans) or "text-decoration:underline" in str(spans) or "text-decoration: underline" in str(spans) or "border-bottom:" in str(spans):
  28. if spans.attr("style") == "text-decoration:underline;" or spans.attr("style") == "text-decoration: underline;":
  29. res2.append(str(spans).replace("<br/>", "\n").replace("<br>", "\n"))
  30. else:
  31. res2.append(spans.html().replace("<br/>", "\n").replace("<br>", "\n"))
  32. else:
  33. if "<br>" in str(spans) or "<br/>" in str(spans):
  34. spans = str(spans).replace("<br/>", "###").replace("<br>", "###")
  35. spans = pq(spans)
  36. new_spans = list(map(lambda x: str(x), spans.text().split("###")))
  37. res2.extend(new_spans)
  38. else:
  39. if spans.text().strip():
  40. res2.append(spans.text())
  41. a.append(("".join(res2)+"\n"))
  42. elif str(line).startswith("<ul"):
  43. a.append(line.text().replace("\n", " ") + "\n")
  44. elif line.attr("class") == "slave-datas":
  45. for j, ss in enumerate(line.children().items()):
  46. if ss(".read-list-title").text():
  47. a.append(str(j + 1) + "." + ss(".read-list-title").text() + "\n")
  48. if ss(".read-list-opt").text():
  49. a.append(ss(".read-list-opt").text().replace("\n", "\t") + "\n")
  50. if str(ss).startswith("<ul"):
  51. a.append(ss.text() + "\n")
  52. elif str(line).startswith("<table"):
  53. a.append(line + "\n")
  54. elif str(line).startswith("<ol"):
  55. for i, ss in enumerate(line.children().items()):
  56. a.append(str(i + 1) + "." + ss.text().replace("\n", " ") + "\n")
  57. else:
  58. if line.text().strip():
  59. a.append(line.text() + "\n")
  60. else:
  61. if html.text().strip():
  62. a.append(html.text() + "\n")
  63. new_a = list(filter(lambda x: x.strip(), a))
  64. return new_a
  # Manual demo, executed only when the file is run as a script: feeds a
  # hard-coded sample HTML fragment (a reading passage, four multiple-choice
  # questions, then answer and analysis paragraphs) to again_parse() and
  # pretty-prints the resulting list of lines.
  65. if __name__ == '__main__':
  66. cons = '''<p>It was 1504,and Columbus was making another trip to the New World.Columbus and his men needed fresh water and food after three months at sea.They saw an island and went on shore.On the island there were unfriendly Indians who refused to give food to them.Columbus’ men were afraid of the Indians,but he had a clever plan.He used sign language to tell the Indians about his mysterious (神秘的) power to turn off the light in the sky.He knew about a lunar eclipse ( 月蚀) the next night because the information was in his almanac ( 天 文 历 书).Columbus told the Indians,“Tomorrow night I’ll turn off the light in the sky.” But they didn’t believe him.When the eclipse began the next night,the Indians became very frightened.They begged Columbus to turn &nbsp; on &nbsp; the &nbsp; light &nbsp; again,and &nbsp; they &nbsp; quickly &nbsp; gave &nbsp; him &nbsp; all &nbsp; the &nbsp; food &nbsp; and &nbsp; water &nbsp; he wanted.Immediately Columbus and his men hurried back to the ship and sailed away in the moonless night.<br/></p><p><span class="ques-num">1.</span>Why did Columbus and his men stop at the island?</p><ul class="read-list-opt list-paddingleft-2"><li><p><span class="lis-opt">A.</span>Because they wanted to meet the Indians there</p></li><li><p><span class="lis-opt">B.</span>Because they hoped to get supplies of food and water</p></li><li><p><span class="lis-opt">C.</span>Because they had never been on the island before</p></li><li><p><span class="lis-opt">D.</span>Because they had planned to visit it</p></li></ul><p></p><p><span class="ques-num">2.</span>The Indians<span style="text-decoration: underline;">&nbsp; &nbsp; &nbsp;</span>Columbus and his men.</p><ul class="read-list-opt list-paddingleft-2"><li><p><span class="lis-opt">A.</span>were glad to see</p></li><li><p><span class="lis-opt">B.</span>were kind to</p></li><li><p><span class="lis-opt">C.</span>welcomed</p></li><li><p><span class="lis-opt">D.</span>were not kind 
to</p></li></ul><p></p><p><span class="ques-num">3.</span>How did Columbus tell the Indians that he had mysterious power?</p><ul class="read-list-opt list-paddingleft-2"><li><p><span class="lis-opt">A.</span>He used movements of hands and expressions in his face</p></li><li><p><span class="lis-opt">B.</span>He spoke in the language of the Indians</p></li><li><p><span class="lis-opt">C.</span>He drew a lot of signs</p></li><li><p><span class="lis-opt">D.</span>He wrote in the language of the Indians</p></li></ul><p></p><br/><p><span class="ques-num">4.</span>The Indians gave Columbus food and water because they&nbsp;<span style="border-bottom: 1px solid #000;color: #000;margin: 0 4px;display:inline-block;min-width: 60px;">&nbsp;</span>&nbsp;.</p><ul class="read-list-opt list-paddingleft-2"><li><p><span class="lis-opt">A.</span>believed Columbus was a man with mysterious power</p></li><li><p><span class="lis-opt">B.</span>were interested in Columbustrip</p></li><li><p><span class="lis-opt">C.</span>wanted to help Columbus</p></li><li><p><span class="lis-opt">D.</span>were clever</p></li></ul><p></p>
  67. <p>【答案】<span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">1.</span><span data-v-73bdf652="" class="dib">A</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">2.</span><span data-v-73bdf652="" class="dib">B</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">3.</span><span data-v-73bdf652="" class="dib">C</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">4.</span><span data-v-73bdf652="" class="dib">D</span></span></p>
  68. <p>【解析】大风歌</p>'''
  # Disabled alternative input: load the HTML from a local file instead of
  # using the inline sample above.
  69. # cons = open(r"C:\Users\HJ\Desktop\TEST_FILES\888.txt", encoding="utf-8").read()
  70. # # pprint(cons)
  # Parse the sample and pretty-print the returned list of lines.
  71. pprint(again_parse(cons))
  72. # again_parse(cons)