# -*- coding: utf-8 -*-
import re
from pprint import pprint

from pyquery import PyQuery as pq

# Disabled source-attribution filter, kept for reference. The literals are
# data (site names / attribution strings) and are left verbatim.
#
# pattern = re.compile(r"\[来源.*?\]|www\..*?com")
#
# filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
#                 "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
#                 "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
#                 "高考试题来源:"]
#
#
# def filter_word(txt_list):
#     new_txt_list = []
#     for word in txt_list:
#         if not word.strip():
#             continue
#         new_word = re.sub(pattern, "", word)
#         for keys in filter_words:
#             if keys in new_word:
#                 new_word = new_word.replace(keys, "")
#         new_txt_list.append(new_word)
#     return new_txt_list


def filter_data(x):
    """Return ``str(x)`` when it holds visible text, otherwise ``None``.

    Used as the predicate of ``filter(filter_data, ...)`` in ``again_parse``,
    so the falsy ``None`` result is what drops blank fragments.

    Args:
        x: Any value; it is converted with ``str()`` before the check.

    Returns:
        The string form of ``x``, or ``None`` when nothing is left after
        removing the " " literal and stripping surrounding whitespace.
    """
    text = str(x)
    # NOTE(review): the replaced literal reads as a single space in this copy;
    # confirm it was not a non-breaking space or "&nbsp;" in the original.
    if not text.replace(" ", "").strip():
        return None
    return text


def replace_k(con):
    """Protect whitespace in ``con`` with "+" and re-parse it as HTML.

    Every whitespace character is replaced by "+" unless it is immediately
    followed by ``src="http``, ``_src="http``, ``class="tiankong"`` or
    ``data-num=``, so the separators in front of those attributes survive.
    ``again_parse`` later turns the "+" markers back into "\\xa0".

    Args:
        con: A PyQuery element or markup string; converted with ``str()``.

    Returns:
        A PyQuery object built from the rewritten markup with the "html"
        parser.
    """
    # con = str(con).replace(" ", "+")
    # con = str(con).replace(" ", "+")
    con = re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "+", str(con))
    return pq(con, parser="html")


# NOTE(review): again_parse could not be laid out again. Its pattern and
# replacement literals are empty or cut off in this copy (e.g. re.sub(r'', ...)),
# presumably lost in an extraction step; restore it from version control.
# The remainder of the original line is kept verbatim below.
# def again_parse(content): # todo add 9-4 content = re.sub(r'', "", str(content)) content = re.sub(r'', "", str(content)) content = re.sub(r'', "", str(content)) content = re.sub(r'', "", str(content)) content = str(content).replace("", "").replace("", "") content = str(content).replace("", "").replace("", "") content = str(content).replace('

 

', "\n").replace('


', "\n") # parm = False # if "
" not in str(content): # parm = True html = pq(content, parser="html") a = [] if html.children(): for line in html.children().items(): if str(line).startswith("", "\n").replace("
", "\n").replace("
", "\n") + "\n") else: line = replace_k(line) if "
" in str(line) or "
" in str(line) or "
" in str(line): line = str(line).replace("
", "###").replace("
", "###").replace("
", "###") line = pq(line) new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###"))) a.extend(new_line) else: if line.text().strip(): a.append(line.text().replace("+", "\xa0") + "\n") elif "
" in str(line) and "
" in str(line): line = re.sub(r'', "", str(line.html())) b = line.replace('

', ""). \ replace("
", "\n"). \ replace("
", "\n"). \ replace("
", "\n"). \ replace('

 

', "\n"). \ replace('

', "\n") b_list = b.split("\n") # b_list = list(filter(lambda x: str(x), b_list)) b_list = list(filter(filter_data, b_list)) b_list = list(map(lambda x: str(x) + "\n", b_list)) a.extend(b_list) elif str(line).startswith("____ prize for the winner of the competition is ____ two-week holiday in Paris.

  • A.The; /

  • B.A; /

  • C.A; the

  • D.The; a

【答案】D

【解析】考查冠词,第一个空格表示对比赛获胜者的奖励,是特指,故用the,第二个空格泛指一次为期两周的假期,用不定冠词a,故选D项。句意,这次比赛给获胜者的奖励是在巴黎度假两周。

''' # cons = '''

UnbelievableOh..., _____ you don't mind, I'll stop and take a deep breath.


#

【答案】

1.if。考查if引导的条件状语从句。根据句意可知。


#

【解析】

''' # pprint(cons) # print(again_parse(cons)) # print(again_parse(cons)) # 2021_04_14_09_38_55 path2 = r"F:\zwj\Text_Structure\fail_files\2021_04_14_09_38_55.html" html = open(path2, "r", encoding="utf-8").read() # print(html) print(list(map(lambda x: str(x).replace(" ", " "), again_parse(html))))