# -*- coding: utf-8 -*-
import re
from pprint import pprint

try:
    from pyquery import PyQuery as pq
except ImportError:
    # pyquery is only used by the HTML parsing functions; the plain-text
    # helpers defined here stay importable when it is not installed.
    pq = None

# Matches a bracketed source tag such as "[来源:...]" (non-greedy up to the
# first closing bracket).
pattern = re.compile(r"\[来源.*?\]")

# Literal substrings that are deleted from every text fragment.  Order matters:
# the longer "...提供!" entry is listed before its shorter "...提" prefix so
# the full phrase is removed in one piece.
filter_words = [
    "学科网",
    "高考资源网",
    "Ziyuanku.com",
    "WWW.ziyuanku.com",
    "高☆考♂资♀源€网",
    "w.w.w.k.s.5.u.c.o.m",
    "本资料由《七彩教育网》www.7caiedu.cn 提供!",
    "本资料来源于《七彩教育网》http://www.7caiedu.cn",
    "本资料由《七彩教育网》www.7caiedu.cn 提",
    "高考试题来源:",
]


def filter_word(txt_list):
    """Return the fragments of *txt_list* with source tags and watermarks removed.

    Fragments that are empty or whitespace-only are skipped.  Every other
    fragment has all matches of ``pattern`` and every entry of
    ``filter_words`` deleted.  A fragment that becomes empty after cleaning is
    still kept in the result.

    Args:
        txt_list: iterable of strings.

    Returns:
        A new list of cleaned strings, in input order.
    """
    cleaned = []
    for word in txt_list:
        if not word.strip():
            continue
        new_word = pattern.sub("", word)
        for key in filter_words:
            # str.replace is a no-op when the key is absent, so no membership
            # test is needed first.
            new_word = new_word.replace(key, "")
        cleaned.append(new_word)
    return cleaned


def filter_data(x):
    """Return ``str(x)`` if it has non-blank content, otherwise ``None``.

    The result is used as a ``filter()`` predicate: blank entries yield the
    falsy ``None`` and are dropped, all others yield a non-empty (truthy)
    string.
    """
    text = str(x)
    if not text.replace(" ", "").strip():
        return None
    return text


# NOTE(review): the text that follows is an extraction-damaged fragment
# (statements collapsed onto one line, several string literals emptied or
# split across lines).  It is kept verbatim instead of being reconstructed by
# guesswork; restore it from the original source file.
# def again_parse2(content): # todo add 9-4 content = re.sub(r'', "", str(content)) content = re.sub(r'', "", str(content)) content = str(content).replace("", "") content = str(content).replace('

 

', "\n") parm = False if "
" not in str(content): parm = True html = pq(content, parser="html") a = [] if html.children(): for line in html.children().items(): if str(line).startswith("", "\n").replace("
", "\n").replace("
", "\n") + "\n") else: if "
" in str(line) or "
" in str(line) or "
" in str(line): line = str(line).replace("
", "###").replace("
", "###").replace("
", "###") line = pq(line) new_line = list(map(lambda x: str(x) + "\n", line.text().split("###"))) a.extend(new_line) else: if line.text().strip(): a.append(line.text() + "\n") elif str(line).startswith("
"): line = re.sub(r'', "", str(line.html())) b = line.replace('

', ""). \ replace("
", "\n"). \ replace("
", "\n"). \ replace("
", "\n"). \ replace('

 

', "\n").\ replace('

', "\n") b_list = b.split("\n") # b_list = list(filter(lambda x: str(x), b_list)) b_list = list(filter(filter_data, b_list)) b_list = list(map(lambda x: str(x) + "\n", b_list)) a.extend(b_list) elif str(line).startswith("', "", str(content)) content = re.sub(r'', "", str(content)) content = str(content).replace("", "") content = str(content).replace('

 

', "\n") parm = False if "
" not in str(content): parm = True html = pq(content, parser="html") a = [] if html.children(): for line in html.children().items(): if str(line).startswith("", "\n").replace("
", "\n").replace("
", "\n") + "\n") else: line = replace_k(line) if "
" in str(line) or "
" in str(line) or "
" in str(line): line = str(line).replace("
", "###").replace("
", "###").replace("
", "###") line = pq(line) new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###"))) a.extend(new_line) else: if line.text().strip(): a.append(line.text().replace("+", "\xa0") + "\n") elif str(line).startswith("
"): line = re.sub(r'', "", str(line.html())) b = line.replace('

', ""). \ replace("
", "\n"). \ replace("
", "\n"). \ replace("
", "\n"). \ replace('

 

', "\n"). \ replace('

', "\n") b_list = b.split("\n") # b_list = list(filter(lambda x: str(x), b_list)) b_list = list(filter(filter_data, b_list)) b_list = list(map(lambda x: str(x) + "\n", b_list)) a.extend(b_list) elif str(line).startswith("        Amelia Earhart was borm in 1897 in Kansas She and her younger sister Muriel were very active. They

 

   rode horses, played baseball and basketball.

 

     In 1916 Amelia completed high school and then prepared to enteruniversity.

 

   During a holiday, she visited(she)sister in Toronto,Canada. World War One(begin)by then.

 

   And Amelia(shock)by the number of wounded soldiers,_were sent home from the fighting in

 

   France. She decided that she would be more(use)as a nurse than as a student. So she joined the Red

 

   Cross.

 

     Amelia Earhart first became interested in flying when(live)in Toronto So she wanted to learn to

 

   fly. Then she(receive)her official pilot's license. On May 20th, 1932,Amelia tookfrom

 

   Newfoundland in a small red and gold plane. And after fifteen hours she landed in Ireland. She became the

 

   first woman(ly)across the Atlantic Ocean alone.

【答案】1.a2.her3.had begun4.was shocked5.who6.useful7.living8.received9.off10.to fly

【解析】

asdasdasd

''' # pprint(cons) # print(again_parse(cons)) pprint(again_parse(cons))