123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- # -*- coding: utf-8 -*-
- import re
- from pprint import pprint
- from pyquery import PyQuery as pq
# Shortest bracketed span that starts with "来源" ("source"), e.g. "[来源:...]".
pattern = re.compile(r"\[来源.*?\]")

# Site names and attribution phrases that filter_word() deletes from text.
# The full "...提供!" phrase is listed ahead of its truncated "...提" form so
# the longer variant is removed first.
filter_words = [
    "学科网",
    "高考资源网",
    "Ziyuanku.com",
    "WWW.ziyuanku.com",
    "高☆考♂资♀源€网",
    "w.w.w.k.s.5.u.c.o.m",
    "本资料由《七彩教育网》www.7caiedu.cn 提供!",
    "本资料来源于《七彩教育网》http://www.7caiedu.cn",
    "本资料由《七彩教育网》www.7caiedu.cn 提",
    "高考试题来源:",
]
def filter_word(txt_list):
    """Remove source tags and known site/attribution strings from each entry.

    Entries that are empty or whitespace-only are dropped. Every other entry
    has all matches of the module-level ``pattern`` removed, followed by every
    string listed in ``filter_words``, and is then kept even if the result is
    empty.

    Args:
        txt_list: iterable of strings.

    Returns:
        A new list with the cleaned strings, in input order.
    """
    cleaned = []
    for raw in txt_list:
        if raw.strip():
            text = pattern.sub("", raw)
            # str.replace is a no-op when the marker is absent, so no
            # membership test is needed first.
            for marker in filter_words:
                text = text.replace(marker, "")
            cleaned.append(text)
    return cleaned
def filter_data(x):
    """Return ``str(x)`` when it contains visible text, otherwise ``None``.

    The value is considered blank when nothing is left after removing spaces
    and stripping surrounding whitespace. The str-or-None result makes the
    function usable as a ``filter()`` predicate.
    """
    text = str(x)
    has_content = bool(text.replace(" ", "").strip())
    return text if has_content else None
def again_parse2(content):
    """Flatten an HTML fragment into a list of newline-terminated text lines.

    Before parsing, the red/blue ``<font>`` highlight wrappers are stripped
    (their text is kept) and zero-height spacer paragraphs are turned into
    newlines. The fragment is then parsed with pyquery and each direct child
    of the parsed root is rendered according to its markup:

    * ``<p...>`` with text: kept as inner HTML when it holds a fill-in image,
      an underline or a bottom border; otherwise reduced to plain text and
      split at ``<br>`` tags.
    * ``<article>``: paragraph tags removed, split into lines, blank lines
      dropped via ``filter_data``.
    * ``<ul...>``: text on a single line.
    * element with class ``slave-datas``: numbered titles, tab-joined
      options, and nested ``<ul>`` text for each child.
    * ``<table...>``: kept as raw markup.
    * ``<ol...>``: one numbered line per child.
    * anything else: its text, if any.

    When the root has no children, the root's own text is used.

    Args:
        content: HTML markup; coerced with ``str()``.

    Returns:
        A list of non-blank strings, each ending in a newline. When the
        cleaned markup contains no ``<article>`` tag and at least one line
        was produced, the first line is prefixed with four non-breaking
        spaces (U+00A0). An empty list is returned when no text is found.
    """
    # todo add 9-4
    content = str(content)
    # Drop the highlight wrappers but keep the text they enclose.
    for wrapper in ('<font style="color: red">', '<font style="color: blue">', "</font>"):
        content = content.replace(wrapper, "")
    content = content.replace('<p style="height: 0;"> </p>', "\n")
    indent_first = "<article>" not in content

    html = pq(content, parser="html")
    a = []
    if html.children():
        for line in html.children().items():
            markup = str(line)
            if markup.startswith("<p") and line.text().strip():
                if ('<img src="http://zsytk2.zhixinhuixue.com/static/images' in markup
                        or "text-decoration: underline" in markup
                        or "border-bottom:" in markup):
                    # Keep the inner HTML so blanks/underlines survive.
                    kept = line.html()
                    kept = kept.replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n")
                    a.append(kept + "\n")
                elif "<br>" in markup or "<br/>" in markup or "<br />" in markup:
                    # Mark line breaks with a token that survives .text(),
                    # then split on it.
                    marked = markup.replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
                    line = pq(marked)
                    a.extend(str(piece) + "\n" for piece in line.text().split("###"))
                elif line.text().strip():
                    a.append(line.text() + "\n")
            elif markup.startswith("<article>"):
                body = re.sub(r'<p.*?>', "", str(line.html()))
                body = body.replace('</p>', "")
                for br in ("<br>", "<br/>", "<br />"):
                    body = body.replace(br, "\n")
                a.extend(str(part) + "\n" for part in body.split("\n") if filter_data(part))
            elif markup.startswith("<ul"):
                a.append(line.text().replace("\n", " ") + "\n")
            elif line.attr("class") == "slave-datas":
                for j, ss in enumerate(line.children().items()):
                    title = ss(".read-list-title").text()
                    if title:
                        a.append(str(j + 1) + "." + title + "\n")
                    options = ss(".read-list-opt").text()
                    if options:
                        a.append(options.replace("\n", "\t") + "\n")
                    if str(ss).startswith("<ul"):
                        a.append(ss.text() + "\n")
            elif markup.startswith("<table"):
                a.append(markup + "\n")
            elif markup.startswith("<ol"):
                for i, ss in enumerate(line.children().items()):
                    a.append(str(i + 1) + "." + ss.text().replace("\n", " ") + "\n")
            elif line.text().strip():
                a.append(line.text() + "\n")
    elif html.text().strip():
        a.append(html.text() + "\n")

    new_a = [item for item in a if item.strip()]
    # Guard against an empty result: indexing new_a[0] unconditionally raised
    # IndexError for fragments that yield no text (e.g. "<p></p>").
    if indent_first and new_a:
        new_a[0] = "\xa0" * 4 + new_a[0]
    return new_a
def replace_k(con):
    """Re-parse ``con`` as HTML after swapping every space for a "+" placeholder.

    Callers turn the "+" characters back into non-breaking spaces after
    extracting text, so spacing is not collapsed by the text extraction.

    NOTE(review): the replaced character appears as a plain space here, which
    also rewrites spaces inside tags/attributes -- confirm it was not meant to
    be a non-breaking space.

    Args:
        con: markup or pyquery object; coerced with ``str()``.

    Returns:
        A PyQuery object built from the substituted markup.
    """
    placeholder_markup = str(con).replace(" ", "+")
    return pq(placeholder_markup, parser="html")
def again_parse(content):
    """Flatten an HTML fragment into cleaned, newline-terminated text lines.

    Before parsing, the red/blue ``<font>`` highlight wrappers are stripped
    (their text is kept) and zero-height spacer paragraphs are turned into
    newlines. The fragment is then parsed with pyquery and each direct child
    of the parsed root is rendered according to its markup:

    * ``<p...>`` with text: kept as inner HTML when it holds a fill-in image,
      an underline or a bottom border; otherwise passed through
      ``replace_k`` and reduced to plain text, split at ``<br>`` tags, with
      the "+" placeholders turned into non-breaking spaces (U+00A0).
    * ``<article>``: paragraph tags removed, split into lines, blank lines
      dropped via ``filter_data``.
    * ``<ul...>``: its text.
    * element with class ``slave-datas``: numbered titles, tab-joined
      options, and nested ``<ul>`` text for each child.
    * ``<table...>``: kept as raw markup.
    * ``<ol...>``: one numbered line per child.
    * anything else: its text (via ``replace_k``), if any.

    When the root has no children, the root's own text is used.

    NOTE(review): because "+" is the placeholder, a literal "+" in the source
    text of the ``replace_k`` branches is also turned into U+00A0.

    Args:
        content: HTML markup; coerced with ``str()``.

    Returns:
        The non-blank lines after ``filter_word`` has removed source tags and
        site/attribution strings.
    """
    # todo add 9-4
    content = str(content)
    # Drop the highlight wrappers but keep the text they enclose.
    for wrapper in ('<font style="color: red">', '<font style="color: blue">', "</font>"):
        content = content.replace(wrapper, "")
    content = content.replace('<p style="height: 0;"> </p>', "\n")

    html = pq(content, parser="html")
    a = []
    if html.children():
        for line in html.children().items():
            markup = str(line)
            if markup.startswith("<p") and line.text().strip():
                if ('<img src="http://zsytk2.zhixinhuixue.com/static/images' in markup
                        or "text-decoration: underline" in markup
                        or "border-bottom:" in markup
                        or "text - decoration: underline" in markup):
                    # Keep the inner HTML so blanks/underlines survive.
                    kept = line.html()
                    kept = kept.replace("<br />", "\n").replace("<br/>", "\n").replace("<br>", "\n")
                    a.append(kept + "\n")
                else:
                    line = replace_k(line)
                    markup = str(line)
                    if "<br>" in markup or "<br/>" in markup or "<br />" in markup:
                        # Mark line breaks with a token that survives
                        # .text(), then split on it.
                        marked = markup.replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
                        line = pq(marked)
                        a.extend(
                            str(piece).replace("+", "\xa0") + "\n"
                            for piece in line.text().split("###")
                        )
                    elif line.text().strip():
                        a.append(line.text().replace("+", "\xa0") + "\n")
            elif markup.startswith("<article>"):
                body = re.sub(r'<p.*?>', "", str(line.html()))
                body = body.replace('</p>', "")
                for br in ("<br>", "<br/>", "<br />"):
                    body = body.replace(br, "\n")
                a.extend(str(part) + "\n" for part in body.split("\n") if filter_data(part))
            elif markup.startswith("<ul"):
                a.append(line.text() + "\n")
            elif line.attr("class") == "slave-datas":
                for j, ss in enumerate(line.children().items()):
                    title = ss(".read-list-title").text()
                    if title:
                        a.append(str(j + 1) + "." + title + "\n")
                    options = ss(".read-list-opt").text()
                    if options:
                        a.append(options.replace("\n", "\t") + "\n")
                    if str(ss).startswith("<ul"):
                        a.append(ss.text() + "\n")
            elif markup.startswith("<table"):
                a.append(markup + "\n")
            elif markup.startswith("<ol"):
                for i, ss in enumerate(line.children().items()):
                    a.append(str(i + 1) + "." + ss.text() + "\n")
            elif line.text().strip():
                line = replace_k(line)
                a.append(line.text().replace("+", "\xa0") + "\n")
    elif html.text().strip():
        a.append(html.text() + "\n")

    new_a = [item for item in a if item.strip()]
    return filter_word(new_a)
if __name__ == '__main__':
    # Manual smoke test: a fill-in-the-blank passage with highlight <font>
    # tags, zero-height spacer paragraphs and blank-marker images, followed by
    # an answer paragraph and an analysis paragraph.
    cons = '''<p> Amelia Earhart was <font style="color: red">borm</font> in 1897 in Kansas She and her younger sister Muriel were very active. They</p><p style="height: 0;"> </p> rode horses, played baseball and basketball.<p style="height: 0;"> </p> In 1916 Amelia completed high school and then prepared to enter<img src="http://zsytk2.zhixinhuixue.com/static/images/1.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/1.gif" class="tiankong" data-num="0">university.<p style="height: 0;"> </p> During a holiday, she visited<img src="http://zsytk2.zhixinhuixue.com/static/images/2.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/2.gif" class="tiankong" data-num="1">(she)sister in Toronto<font style="color: blue">,Canada</font>. World War One<img src="http://zsytk2.zhixinhuixue.com/static/images/3.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/3.gif" class="tiankong" data-num="2">(begin)by then.<p style="height: 0;"> </p> And Amelia<img src="http://zsytk2.zhixinhuixue.com/static/images/4.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/4.gif" class="tiankong" data-num="3">(shock)by the number of wounded soldiers<font style="color: blue">,_</font><img src="http://zsytk2.zhixinhuixue.com/static/images/5.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/5.gif" class="tiankong" data-num="4">were sent home from the fighting in<p style="height: 0;"> </p> France. She decided that she would be more<img src="http://zsytk2.zhixinhuixue.com/static/images/6.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/6.gif" class="tiankong" data-num="5">(use)as a nurse than as a student. So she joined the Red<p style="height: 0;"> </p> Cross.<p style="height: 0;"> </p> Amelia Earhart first became interested in flying when<img src="http://zsytk2.zhixinhuixue.com/static/images/7.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/7.gif" class="tiankong" data-num="6">(live)in Toronto So she wanted to learn to<p style="height: 0;"> </p> fly. 
Then she<img src="http://zsytk2.zhixinhuixue.com/static/images/8.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/8.gif" class="tiankong" data-num="7">(receive)her official pilot's license. On May <font style="color: red">20th</font>, 1932,Amelia took<img src="http://zsytk2.zhixinhuixue.com/static/images/9.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/9.gif" class="tiankong" data-num="8">from<p style="height: 0;"> </p> Newfoundland in a small red and gold plane. And after fifteen hours she landed in Ireland. She became the<p style="height: 0;"> </p> first woman<img src="http://zsytk2.zhixinhuixue.com/static/images/10.gif" _src="http://zsytk2.zhixinhuixue.com/static/images/10.gif" class="tiankong" data-num="9">(<font style="color: red">ly</font>)across the Atlantic Ocean alone. <font style="color: blue"> </font></p>
<p>【答案】<span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">1.</span><span data-v-73bdf652="" class="dib">a</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">2.</span><span data-v-73bdf652="" class="dib">her</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">3.</span><span data-v-73bdf652="" class="dib">had begun</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">4.</span><span data-v-73bdf652="" class="dib">was shocked</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">5.</span><span data-v-73bdf652="" class="dib">who</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">6.</span><span data-v-73bdf652="" class="dib">useful</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">7.</span><span data-v-73bdf652="" class="dib">living</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">8.</span><span data-v-73bdf652="" class="dib">received</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">9.</span><span data-v-73bdf652="" class="dib">off</span></span><span data-v-73bdf652="" class="answer-opts"><span data-v-73bdf652="" class="dib">10.</span><span data-v-73bdf652="" class="dib">to fly</span></span></p>
<p>【解析】<p>asdasdasd</p></p>'''
    # Pretty-print the list of lines produced by again_parse for the sample.
    pprint(again_parse(cons))
|