# -*- coding: utf-8 -*-
# html_again_parse.py
import re
from pprint import pprint

from pyquery import PyQuery as pq
#
# pattern = re.compile(r"\[来源.*?\]|www\..*?com")
#
# filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
#                 "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
#                 "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
#                 "高考试题来源:"]
#
#
# def filter_word(txt_list):
#     new_txt_list = []
#     for word in txt_list:
#         if not word.strip():
#             continue
#         new_word = re.sub(pattern, "", word)
#         for keys in filter_words:
#             if keys in new_word:
#                 new_word = new_word.replace(keys, "")
#         new_txt_list.append(new_word)
#     return new_txt_list
  25. def filter_data(x):
  26. if not str(x).replace(" ", "").strip():
  27. pass
  28. else:
  29. return str(x)
  30. def replace_k(con):
  31. # con = str(con).replace(" ", "+")
  32. # con = str(con).replace(" ", "+")
  33. con = re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "#+#", str(con))
  34. return pq(con, parser="html")
  35. def css_label_wash(content):
  36. # todo add 9-4
  37. """
  38. 清洗文本中带有的css标签
  39. :param content:
  40. :return:
  41. """
  42. # temp_con = re.sub('</?p(\s*|\s+style=.*?")?>', "", str(content))
  43. if re.search('</?(span|font|article|ul|ol|div)(\s*|\s+style=.*?")>|text\s*-\s*decoration: underline|border\s*-\s*bottom:', str(content)) is None:
  44. # content = re.sub('</p\s*>', "\n", content).strip().replace("\n\n", "\n").replace("\n", "<br/>")
  45. def subp(s):
  46. cons = s.group(1)
  47. if '<p style="text-indent:' in cons or '<p style="text-align:' in cons:
  48. return cons
  49. else:
  50. return "<p>"
  51. content = re.sub('(<p(\s*|\s+style=.*?")>)', subp, content)
  52. content = re.sub('</p><p>', "</p>\n<p>", content)
  53. return content
  54. else:
  55. content = re.sub(r'<font\s+style="color: red">', "", str(content))
  56. content = re.sub(r'<span\s+style="color: red">', "", str(content))
  57. content = re.sub(r'<span\s+style="color: blue">', "", str(content))
  58. content = re.sub(r'<font\s+style="color: blue">', "", str(content))
  59. content = content.replace("</font >", "").replace("</font>", "")
  60. # content = content.replace("</span >", "").replace("</span>", "")
  61. content = content.replace('<p style="height: 0;">&nbsp;</p>', "\n").replace('<p><br/></p>', "\n")
  62. content = content.replace("</p>", "</p>\n")
  63. # content = re.sub('<p( style=.*?")?>', "", content)
  64. # content = re.sub('<br\s*/?>', "\n", content)
  65. # parm = False
  66. # if "<article>" not in str(content):
  67. # parm = True
  68. subs2img = {}
  69. if re.search('<img.*? src="', content) or "</table>" in content:
  70. all_imgs = re.findall('<img.*? src=.*?[\s/"]>|<table.*?>.*?</table>',
  71. content, flags=re.S)
  72. for k, img in enumerate(all_imgs):
  73. content = content.replace(img, "&{}&".format(k))
  74. subs2img["&{}&".format(k)] = img
  75. content = re.sub(r"<(su[bp])>(.*?)</(su[bp])>", r"【\1】\2【/\3】", content)
  76. html = pq(content, parser="html")
  77. a = []
  78. if html.children():
  79. for line in html.children().items(): # <p>.*?</p>里面的内容可能会被过滤掉
  80. test = line.text()
  81. # 保留下划线及着重符标签 <span style="text-decoration: underline;">
  82. # 波浪线:<span style="text-decoration: underline wavy;">
  83. # pq会将多个空格换成一个
  84. # print(str(line))
  85. if '<span style="text-decoration: underline' in str(line) or '<span class="dots"' in str(line)\
  86. or re.search('<(p|div)( class="[a-z\-]+")? style="text-indent:'
  87. '|<(p|div)( class="[a-z\-]+")? style="text-align:|<strong>|<em>', str(line)):
  88. line = re.sub(r'<span style="text-decoration: underline(.*?">.+?)</span>', r"【1#\1##】", str(line))
  89. line = re.sub(r'<span class="dots">(.+?)</span>', r"【2#\1##】", str(line))
  90. line = re.sub(r'<(p style="text-(indent|align):.*?">.+?)</p>', r"【\1##3】", str(line))
  91. line = re.sub(r'<div( class="[a-z\-]+")?( style="text-(indent|align):.*?")( [a-z\d\-"=]+")?(>.+?)</div>',
  92. r"【div\2\5##3】", str(line), flags=re.S)
  93. # line = re.sub(r'<(strong|em)>(.+?)</\1>', r"【\1##\2##\1】", str(line))
  94. # 分开处理会比较好,嵌套的格式也能全处理
  95. line = re.sub(r'<strong>(.+?)</strong>', r"【strong##\1##strong】", str(line))
  96. line = re.sub(r'<em>(.+?)</em>', r"【em##\1##em】", str(line))
  97. # print(line)
  98. line = line.replace(" ", "【+】")
  99. line = pq(line)
  100. new_line = list(map(lambda x: str(x).replace("【1#", '<span style="text-decoration: underline')
  101. .replace("##】", "</span>").replace("【2#", '<span class="dots">')
  102. .replace("【p【+】style=", "<p style=").replace("【div【+】style=", "<p style=")
  103. .replace("【div【+】class=", "<p class=")
  104. .replace("##3】", "</p>").replace("【strong##", "<strong>").replace("【em##", "<em>")
  105. .replace("##strong】", "</strong>").replace("##em】", "</em>").replace("【+】", " "),
  106. line.text().split("\n")))
  107. a.extend(new_line)
  108. elif str(line).startswith("<p") and line.text().strip():
  109. if '<img src="http://' in str(line) or "text-decoration: underline" in str(line) or "border-bottom:" in str(line) or "text - decoration: underline" in str(line):
  110. # a.append(line.html().replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n") + "\n")
  111. a.append(line.html())
  112. else:
  113. line = replace_k(line)
  114. if re.search(r"\n|<br\s*/?>", str(line), flags=re.S):
  115. line = re.sub(r"\n|<br\s*/?>", "#*#", str(line), flags=re.S)
  116. line = pq(line)
  117. new_line = list(map(lambda x: str(x).replace("#+#", " "), line.text().split("#*#")))
  118. a.extend(new_line)
  119. else:
  120. if line.text().strip():
  121. a.append(line.text().replace("#+#", " "))
  122. elif "<article>" in str(line) and "</article>" in str(line):
  123. line = re.sub(r'<p.*?>', "", str(line.html()))
  124. b = line.replace('</p>', ""). \
  125. replace("<br>", "\n"). \
  126. replace("<br/>", "\n"). \
  127. replace("<br />", "\n"). \
  128. replace('<p style="height: 0;">&nbsp;</p>', "\n"). \
  129. replace('<p style="height: 0;"> </p>', "\n")
  130. b_list = b.split("\n")
  131. # b_list = list(filter(lambda x: str(x), b_list))
  132. b_list = list(filter(filter_data, b_list))
  133. b_list = list(map(lambda x: str(x), b_list))
  134. a.extend(b_list)
  135. elif str(line).startswith("<ul"):
  136. a.append(line.text())
  137. elif line.attr("class") == "slave-datas":
  138. for j, ss in enumerate(line.children().items()):
  139. if ss(".read-list-title").text():
  140. a.append(str(j + 1) + "." + ss(".read-list-title").text())
  141. if ss(".read-list-opt").text():
  142. a.append(ss(".read-list-opt").text().replace("\n", "\t"))
  143. if str(ss).startswith("<ul"):
  144. a.append(ss.text())
  145. elif str(line).startswith("<table"):
  146. a.append(str(line))
  147. elif str(line).startswith("<ol"):
  148. for i, ss in enumerate(line.children().items()):
  149. a.append(str(i + 1) + "." + ss.text())
  150. else:
  151. # print('test:',line.text()) # 自动去掉了图片
  152. if line.text().strip():
  153. line = replace_k(line)
  154. # a.append(line.text().replace("+", "\xa0") + "\n")
  155. a.append(line.text().replace("#+#", " "))
  156. else:
  157. # if html.text().strip(): 把换行\n 都去掉了
  158. # a.append(html.text())
  159. # a.append(str(html)) # 会自动带上</?p> ;&变为&amp;
  160. a.append(content.strip())
  161. new_a = "\n".join(list(map(lambda x: x.strip(), a)))
  162. if subs2img:
  163. new_a = re.sub("|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a)
  164. # new_a = "<p>" + new_a.replace("\n\n", "\n").replace("\n", "</p>\n<p>") + "</p>"
  165. new_a = "<p>" + new_a.replace("\n\n", "\n") + "</p>"
  166. new_a = re.sub(r'<p>(<p (class|style)=.+?)</p>$', r"\1", new_a, flags=re.S)
  167. # for sb, img in subs2img.items(): # 2021
  168. # new_a = new_a.replace(sb, img)
  169. # if parm:
  170. # new_a[0] = "\xa0" * 4 + new_a[0]
  171. new_a = re.sub(r"【(/?su[bp])】", r"<\1>", new_a)
  172. return new_a
  173. def again_parse(content):
  174. # todo add 9-4
  175. content = re.sub(r'<font\s+style="color: red">', "", str(content))
  176. content = re.sub(r'<span\s+style="color: red">', "", str(content))
  177. content = re.sub(r'<span\s+style="color: blue">', "", str(content))
  178. content = re.sub(r'<font\s+style="color: blue">', "", str(content))
  179. content = str(content).replace("</font >", "").replace("</font>", "")
  180. content = str(content).replace("</span >", "").replace("</span>", "")
  181. content = str(content).replace('<p style="height: 0;">&nbsp;</p>', "\n").replace('<p><br/></p>', "\n")
  182. # parm = False
  183. # if "<article>" not in str(content):
  184. # parm = True
  185. html = pq(content, parser="html")
  186. a = []
  187. if html.children():
  188. for line in html.children().items():
  189. if (str(line).startswith("<p") and line.text().strip()) or "http://zxhx-1302712961.cos.ap" in str(line):
  190. if 'http://zsytk2.zhixinhuixue.com/static/images' in str(line) \
  191. or "text-decoration: underline" in str(line) \
  192. or "border-bottom:" in str(line) \
  193. or "text - decoration: underline" in str(line) \
  194. or 'http://zxhx-1302712961.cos.ap' in str(line):
  195. a.append(line.html().replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n") + "\n")
  196. else:
  197. line = replace_k(line)
  198. if "<br>" in str(line) or "<br/>" in str(line) or "<br />" in str(line):
  199. line = str(line).replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
  200. line = pq(line)
  201. new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###")))
  202. a.extend(new_line)
  203. else:
  204. if line.text().strip():
  205. a.append(line.text().replace("+", "\xa0") + "\n")
  206. elif "<article>" in str(line) and "</article>" in str(line):
  207. line = re.sub(r'<p.*?>', "", str(line.html()))
  208. b = line.replace('</p>', ""). \
  209. replace("<br>", "\n"). \
  210. replace("<br/>", "\n"). \
  211. replace("<br />", "\n"). \
  212. replace('<p style="height: 0;">&nbsp;</p>', "\n"). \
  213. replace('<p style="height: 0;"> </p>', "\n")
  214. b_list = b.split("\n")
  215. # b_list = list(filter(lambda x: str(x), b_list))
  216. b_list = list(filter(filter_data, b_list))
  217. b_list = list(map(lambda x: str(x) + "\n", b_list))
  218. a.extend(b_list)
  219. elif str(line).startswith("<ul"):
  220. a.append(line.text() + "\n")
  221. elif line.attr("class") == "slave-datas":
  222. for j, ss in enumerate(line.children().items()):
  223. if ss(".read-list-title").text():
  224. a.append(str(j + 1) + "." + ss(".read-list-title").text() + "\n")
  225. if ss(".read-list-opt").text():
  226. a.append(ss(".read-list-opt").text().replace("\n", "\t") + "\n")
  227. if str(ss).startswith("<ul"):
  228. a.append(ss.text() + "\n")
  229. elif str(line).startswith("<table"):
  230. a.append(str(line) + "\n")
  231. elif str(line).startswith("<ol"):
  232. for i, ss in enumerate(line.children().items()):
  233. a.append(str(i + 1) + "." + ss.text() + "\n")
  234. else:
  235. if line.text().strip():
  236. line = replace_k(line)
  237. a.append(line.text().replace("+", "\xa0") + "\n")
  238. else:
  239. if html.text().strip():
  240. a.append(html.text() + "\n")
  241. a = "\n".join(a).split("\n")
  242. new_a = list(filter(lambda x: x.strip(), a))
  243. # if parm:
  244. # new_a[0] = "\xa0" * 4 + new_a[0]
  245. return new_a
  246. if __name__ == '__main__':
  247. cons = r'''
  248. <div class="stem-wraper" data-v-494b33d9=""><span class="stem" data-v-494b33d9="">下述有关功和能量说法正确的是( )</span></div>
  249. <ul class="stem-options" data-v-494b33d9="">
  250. <li data-v-494b33d9=""><span class="analysis-prefix" data-v-494b33d9="">A.</span><span data-v-494b33d9="">物体做功越多,<span style="text-decoration: underline wavy;">物体的能量就越大</span></span></li>
  251. <li data-v-494b33d9=""><span class="analysis-prefix" data-v-494b33d9="">B.</span><span data-v-494b33d9="">摩擦力可能对物体做正功,也可能做负功,<span style="text-decoration: underline;">也可以不做功</span></span></li>
  252. <li data-v-494b33d9=""><span class="analysis-prefix" data-v-494b33d9="">C.</span><span data-v-494b33d9="">能量耗散表明,能量守恒定律有些情况下并不成立</span></li>
  253. <li data-v-494b33d9=""><span class="analysis-prefix" data-v-494b33d9="">D.</span><span data-v-494b33d9="">弹簧拉伸时的弹性势能一定大于压缩时的弹性势能</span></li>
  254. </ul>
  255. <div class="topic-analysis" data-v-494b33d9="">
  256. <div class="topic-analysis-content" data-v-494b33d9=""><span class="analysis-prefix" data-v-494b33d9="">【答案】</span><span data-v-494b33d9="">B</span></div>
  257. <div class="topic-analysis-content" data-v-494b33d9=""><span class="analysis-prefix" data-v-494b33d9="">【解析】</span><span data-v-494b33d9="">功是能量转化的量度,物体做功越多,物体的能量转化就越多,而不是能量越大。故A错误;摩擦力方向可能与物体运动方向相同、也与物体运动方向相反,所以摩擦力可能对物体做正功,也可能做负功。物体也可能没有位移,摩擦力不做功,故B正确;能量耗散虽然不会使能的总量减少,但能量的可利用率越来越低,即能量的品质越来越低;根据能量守恒定律可知,虽然能量的可利用率越来越低,但能量总和保持不变,仍然遵守能量守恒定律,故C错误;弹簧的弹性势能与形变量有关,弹簧拉伸时与压缩时弹性势能可能相等,也可能拉伸时的弹性势能小于压缩时的弹性势能。故D错误。故选B。</span></div>
  258. </div>
  259. '''
  260. # cons = '''<p style="margin-top:0;margin-right:0;margin-bottom:0;margin-left:0;text-align:justify;text-justify:inter-ideograph;line-height:150%"><span style="line-height: 150%;font-size: 14px">Unbelievable</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">!</span></span><span style="line-height: 150%;font-size: 14px">Oh..., _____ you don&#39;t mind, I&#39;ll stop and take a deep breath.</span></p><p><br/></p>
  261. # <p>【答案】<p style="margin-top:0;margin-right:0;margin-bottom:0;margin-left:0;text-align:justify;text-justify:inter-ideograph;line-height:150%"><span style=";font-family:宋体;line-height:150%;font-size:14px">1</span><span style="line-height: 150%;font-size: 14px">.if</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">。考查</span></span><span style="line-height: 150%;font-size: 14px">if</span><span style=";font-family:宋体;line-height:150%;font-size:14px"><span style="font-family:宋体">引导的条件状语从句。根据句意可知。</span></span></p><p><br/></p></p>
  262. # <p>【解析】</p>'''
  263. # pprint(cons)
  264. # print(again_parse(cons))
  265. # print(again_parse(cons))
  266. # print(list(map(lambda x: str(x).replace(" ", " "), again_parse(cons))))
  267. # con1 = r'<p>解:A.研究跨栏动作时,刘翔的大小和形状不能忽略,不能看作质点,故A错误;<br/>B.选取不同的参考系,物体的运动状态是不相同的,故B错误;<br/>C.出租车收费是按路程收费的,故C错误;<br/>D.第<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />是指<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553931930702.png" data-latex="${1 \rm{s} }$" width="12",height="11" />的时间,是指从<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930220437.png" data-latex="${3 \rm{s} }$" width="13",height="11" />末到<img src="http://192.168.1.145:10811/static/physical_formulas_imgs/16184553930794225.png" data-latex="${4 \rm{s} }$" width="13",height="11" />末这一段时间,故D正确;<br/>故选:D.</p>'
  268. cons1 = """
  269. <div class="stem-wraper" data-v-00256703="">
  270. <p>阅读下面这首宋诗,完成下列小题。(本题共2小题,9分)</p>
  271. <p style="text-align: center;"><span style="font-family: 楷体;">除夜野宿常州城外二首(其二)</span></p>
  272. <p style="text-align: center;"><span style="font-family: 楷体;">苏轼</span></p>
  273. <p style="text-align: center;"><span style="font-family: 楷体;">南来三见岁云徂<sup>①</sup>,直恐终身走道途。</span></p>
  274. <p style="text-align: center;"><span style="font-family: 楷体;">老去怕看新历日,退归拟学旧桃符。</span></p>
  275. <p style="text-align: center;"><span style="font-family: 楷体;">烟花已作青春<sup>②</sup>意,霜雪偏寻病客须。</span></p>
  276. <p style="text-align: center;"><span style="font-family: 楷体;">但把穷愁博长健,不辞最后饮屠苏<sup>③</sup>。</span></p>
  277. <p>【注】①苏轼于熙宁四年(1071)冬到杭州任通判,至作此诗,已度过三个除夕。岁云徂,谓年岁辞去。徂,往。②青春:春季。③古俗,正月初一家人先幼后长依次饮屠苏酒。《时镜新书》晋董勋云:“正旦饮酒先从小者,何也?勋曰:‘俗以小者得岁,故先酒贺之,老者失时,故后饮酒。’”</p>
  278. </div>
  279. <div class="slave" data-v-00256703="">
  280. <div class="slave-item" data-v-00256703="">
  281. <div class="content" data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">14</span><span data-v-00256703="">.</span><span data-v-00256703="">下列对这首诗的理解和赏析,不正确的一项是( )</span></div>
  282. <ul class="stem-options" data-v-00256703="">
  283. <li data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">A.</span><span data-v-00256703="">诗人离开朝廷南来已三年,恐怕自己终身奔走于宦途而不能践偿其政治抱负。</span></li>
  284. <li data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">B.</span><span data-v-00256703="">因桃符一年一换,诗人自比“桃符”,寄托了自己在新的一年仕途晋升的愿望。</span></li>
  285. <li data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">C.</span><span data-v-00256703="">颈联对仗工整,诗人以自然界万物复苏的繁丽景象来反衬出自己的衰病老迈。</span></li>
  286. <li data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">D.</span><span data-v-00256703="">本诗抒发了除夜感慨,“桃符”“烟花”“屠苏”等意象照应节令,节日氛围浓厚。</span></li>
  287. </ul>
  288. </div>
  289. <div class="slave-item" data-v-00256703="">
  290. <div class="content" data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">15</span><span data-v-00256703="">.</span><span data-v-00256703="">末句“不辞最后饮屠苏”意蕴丰富,请简要分析。</span></div>
  291. </div>
  292. </div>
  293. <div class="topic-analysis" data-v-00256703="">
  294. <div class="topic-analysis-content" data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">【答案】</span><span data-v-00256703=""><br><span class="analysis-prefix">14.</span>&nbsp;&nbsp;B<br><span class="analysis-prefix">15.</span>&nbsp;&nbsp;①饮屠苏酒的顺序是自少至老,诗人说“最后饮”,表明了他年事已高。②诗人“辞”掉的是富贵荣华,表明他不再以仕途不畅为意。③“不辞最后饮”又表明他不以岁月流逝为意,以豁达乐观待之,表现了诗人豪放旷达的情怀。<br></span></div>
  295. <div class="topic-analysis-content" data-v-00256703=""><span class="analysis-prefix" data-v-00256703="">【解析】</span><span data-v-00256703=""><br><span class="analysis-prefix">14.</span>&nbsp;&nbsp;本题考查学生鉴赏诗歌的形象、表达技巧和情感的能力。B.“寄托了自己在新的一年仕途晋升的愿望”曲解文意。由“退归”可知,本句暗指诗人要抛却不如意的仕途,含有退隐之意。故选B。<br><span class="analysis-prefix">15.</span>&nbsp;&nbsp;本题考查学生理解诗句意蕴的能力。诗人一扫前面的郁闷,表示要用“穷”和“愁”换取长久的健康,要屠苏酒来迎新年。正月初一饮屠苏酒是一种习俗,饮用的顺序是自少至老,诗人说“最后饮”,表明了他年事已高。“烟花”二句,以自然界万物复苏的繁丽景象,反衬自己的衰病老迈。“拟学旧桃符”暗指诗人要抛却不如意的仕途,诗人“辞”掉的是富贵荣华,表明他不再以仕途不畅为意。诗人说“不辞最后饮”,不怕轮到我最后一个把屠苏酒饮,表明他不以岁月流逝为意,以豁达乐观待之,以此观照开篇,更见苏轼豪放旷达的情怀。<br></span></div>
  296. </div>
  297. """
  298. conss = css_label_wash(cons1)
  299. print(conss)