html_again_parse_old.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # -*- coding: utf-8 -*-
  2. import re
  3. from pprint import pprint
  4. from pyquery import PyQuery as pq
  5. #
  6. # pattern = re.compile(r"\[来源.*?\]|www\..*?com")
  7. #
  8. # filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
  9. # "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
  10. # "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
  11. # "高考试题来源:"]
  12. #
  13. #
  14. # def filter_word(txt_list):
  15. # new_txt_list = []
  16. # for word in txt_list:
  17. # if not word.strip():
  18. # continue
  19. # new_word = re.sub(pattern, "", word)
  20. # for keys in filter_words:
  21. # if keys in new_word:
  22. # new_word = new_word.replace(keys, "")
  23. # new_txt_list.append(new_word)
  24. # return new_txt_list
  25. def filter_data(x):
  26. if not str(x).replace(" ", "").strip():
  27. pass
  28. else:
  29. return str(x)
  30. def replace_k(con):
  31. # con = str(con).replace(" ", "+")
  32. # con = str(con).replace(" ", "+")
  33. con = re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "+", str(con))
  34. return pq(con, parser="html")
  35. def again_parse(content):
  36. # todo add 9-4
  37. content = re.sub(r'<font\s+style="color: red">', "", str(content))
  38. content = re.sub(r'<span\s+style="color: red">', "", str(content))
  39. content = re.sub(r'<span\s+style="color: blue">', "", str(content))
  40. content = re.sub(r'<font\s+style="color: blue">', "", str(content))
  41. content = str(content).replace("</font >", "").replace("</font>", "")
  42. content = str(content).replace("</span >", "").replace("</span>", "")
  43. content = str(content).replace('<p style="height: 0;">&nbsp;</p>', "\n").replace('<p><br/></p>', "\n")
  44. # parm = False
  45. # if "<article>" not in str(content):
  46. # parm = True
  47. html = pq(content, parser="html")
  48. a = []
  49. if html.children():
  50. for line in html.children().items():
  51. if str(line).startswith("<p") and line.text().strip():
  52. if '<img src="http://zsytk2.zhixinhuixue.com/static/images' in str(line) or "text-decoration: underline" in str(line) or "border-bottom:" in str(line) or "text - decoration: underline" in str(line):
  53. a.append(line.html().replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n") + "\n")
  54. else:
  55. line = replace_k(line)
  56. if "<br>" in str(line) or "<br/>" in str(line) or "<br />" in str(line):
  57. line = str(line).replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
  58. line = pq(line)
  59. new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###")))
  60. a.extend(new_line)
  61. else:
  62. if line.text().strip():
  63. a.append(line.text().replace("+", "\xa0") + "\n")
  64. elif "<article>" in str(line) and "</article>" in str(line):
  65. line = re.sub(r'<p.*?>', "", str(line.html()))
  66. b = line.replace('</p>', ""). \
  67. replace("<br>", "\n"). \
  68. replace("<br/>", "\n"). \
  69. replace("<br />", "\n"). \
  70. replace('<p style="height: 0;">&nbsp;</p>', "\n"). \
  71. replace('<p style="height: 0;"> </p>', "\n")
  72. b_list = b.split("\n")
  73. # b_list = list(filter(lambda x: str(x), b_list))
  74. b_list = list(filter(filter_data, b_list))
  75. b_list = list(map(lambda x: str(x) + "\n", b_list))
  76. a.extend(b_list)
  77. elif str(line).startswith("<ul"):
  78. a.append(line.text() + "\n")
  79. elif line.attr("class") == "slave-datas":
  80. for j, ss in enumerate(line.children().items()):
  81. if ss(".read-list-title").text():
  82. a.append(str(j + 1) + "." + ss(".read-list-title").text() + "\n")
  83. if ss(".read-list-opt").text():
  84. a.append(ss(".read-list-opt").text().replace("\n", "\t") + "\n")
  85. if str(ss).startswith("<ul"):
  86. a.append(ss.text() + "\n")
  87. elif str(line).startswith("<table"):
  88. a.append(str(line) + "\n")
  89. elif str(line).startswith("<ol"):
  90. for i, ss in enumerate(line.children().items()):
  91. a.append(str(i + 1) + "." + ss.text() + "\n")
  92. else:
  93. if line.text().strip():
  94. line = replace_k(line)
  95. a.append(line.text().replace("+", "\xa0") + "\n")
  96. else:
  97. if html.text().strip():
  98. a.append(html.text() + "\n")
  99. new_a = list(filter(lambda x: x.strip(), a))
  100. # if parm:
  101. # new_a[0] = "\xa0" * 4 + new_a[0]
  102. return new_a
  103. if __name__ == '__main__':
  104. cons = '''<p>____ prize&nbsp;for&nbsp;the&nbsp;winner&nbsp;of&nbsp;the&nbsp;competition&nbsp;is&nbsp;____&nbsp;two-week&nbsp;holiday&nbsp;in&nbsp;Paris.</p><ul class="option-datas list-paddingleft-2"><li><p><span class="lis-opt">A.</span>The;&nbsp;/</p></li><li><p><span class="lis-opt">B.</span>A;&nbsp;/</p></li><li><p><span class="lis-opt">C.</span>A;&nbsp;the</p></li><li><p><span class="lis-opt">D.</span>The;&nbsp;a</p></li></ul>
  105. <p>【答案】D</p>
  106. <p>【解析】考查冠词,第一个空格表示对比赛获胜者的奖励,是特指,故用the,第二个空格泛指一次为期两周的假期,用不定冠词a,故选D项。句意,这次比赛给获胜者的奖励是在巴黎度假两周。</p>'''
  107. # cons = '''<p style="margin-top:0;margin-right:0;margin-bottom:0;margin-left:0;text-align:justify;text-justify:inter-ideograph;line-height:150%"><span style="line-height: 150%;font-size: 14px">Unbelievable</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">!</span></span><span style="line-height: 150%;font-size: 14px">Oh..., _____ you don&#39;t mind, I&#39;ll stop and take a deep breath.</span></p><p><br/></p>
  108. # <p>【答案】<p style="margin-top:0;margin-right:0;margin-bottom:0;margin-left:0;text-align:justify;text-justify:inter-ideograph;line-height:150%"><span style=";font-family:宋体;line-height:150%;font-size:14px">1</span><span style="line-height: 150%;font-size: 14px">.if</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">。考查</span></span><span style="line-height: 150%;font-size: 14px">if</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">引导的条件状语从句。根据句意可知。</span></span></p><p><br/></p></p>
  109. # <p>【解析】</p>'''
  110. # pprint(cons)
  111. # print(again_parse(cons))
  112. # print(again_parse(cons))
  113. # 2021_04_14_09_38_55
  114. path2 = r"F:\zwj\Text_Structure\fail_files\2021_04_14_09_38_55.html"
  115. html = open(path2, "r", encoding="utf-8").read()
  116. # print(html)
  117. print(list(map(lambda x: str(x).replace(" ", " "), again_parse(html))))