# -*- coding: utf-8 -*-
import re
from pprint import pprint
from pyquery import PyQuery as pq
#
# pattern = re.compile(r"\[来源.*?\]|www\..*?com")
#
# filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
# "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
# "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
# "高考试题来源:"]
#
#
# def filter_word(txt_list):
# new_txt_list = []
# for word in txt_list:
# if not word.strip():
# continue
# new_word = re.sub(pattern, "", word)
# for keys in filter_words:
# if keys in new_word:
# new_word = new_word.replace(keys, "")
# new_txt_list.append(new_word)
# return new_txt_list
def filter_data(x):
    """Predicate for ``filter``: keep only entries that contain visible text.

    Args:
        x: Any value; it is converted with ``str()`` before being checked.

    Returns:
        ``str(x)`` when something is left after removing " " characters and
        stripping surrounding whitespace, otherwise ``None`` (falsy), so
        that ``filter(filter_data, items)`` drops blank entries.
    """
    text = str(x)
    # Guard clause instead of the former ``if not ...: pass / else: return``.
    if not text.replace(" ", "").strip():
        return None
    return text
def replace_k(con):
    """Mark whitespace in a fragment with "+" and re-parse it with pyquery.

    Every whitespace character of ``str(con)`` is replaced by "+", except a
    whitespace character that is immediately followed by ``src="http``,
    ``_src="http``, ``class="tiankong"`` or ``data-num=``.  The rewritten
    markup is then parsed with ``pq(..., parser="html")`` and returned.
    """
    markup = str(con)
    # Negative lookahead: whitespace in front of these attributes is left alone.
    whitespace_to_mark = r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))'
    marked = re.sub(whitespace_to_mark, "+", markup)
    return pq(marked, parser="html")
def again_parse(content):
# Clean up the markup in `content`, parse it with pyquery and collect
# newline-terminated text pieces from the top-level children into the list `a`.
# NOTE(review): the regex/string literals in this function look as if their
# HTML tag content was lost (empty patterns, literals split across lines), and
# the indentation is gone, so the function is not valid Python as it stands.
# The tail of the function (presumably a `return a`) also appears to be fused
# with a sample string -- restore from the original file before changing logic.
# todo add 9-4
# Pre-processing passes over the raw markup (patterns are empty as shown here).
content = re.sub(r'', "", str(content))
content = re.sub(r'', "", str(content))
content = re.sub(r'', "", str(content))
content = re.sub(r'', "", str(content))
content = str(content).replace("", "").replace("", "")
content = str(content).replace("", "").replace("", "")
content = str(content).replace('
', "\n").replace('
', "\n")
# parm = False
# if "" not in str(content):
# parm = True
# Parse the cleaned markup with pyquery's HTML parser.
html = pq(content, parser="html")
# Output accumulator: each entry is one text piece ending in "\n".
a = []
if html.children():
for line in html.children().items():
if str(line).startswith("", "\n").replace("
", "\n").replace("
", "\n") + "\n")
else:
# replace_k swaps whitespace for "+" placeholders; they are turned into
# "\xa0" again when the text is extracted below.
line = replace_k(line)
if "
" in str(line) or "
" in str(line) or "
" in str(line):
line = str(line).replace("
", "###").replace("
", "###").replace("
", "###")
line = pq(line)
# Split on the "###" markers inserted above and restore the placeholders.
new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###")))
a.extend(new_line)
else:
if line.text().strip():
a.append(line.text().replace("+", "\xa0") + "\n")
elif "" in str(line) and "" in str(line):
line = re.sub(r'', "", str(line.html()))
b = line.replace('', ""). \
replace("
", "\n"). \
replace("
", "\n"). \
replace("
", "\n"). \
replace('
', "\n"). \
replace('
', "\n")
b_list = b.split("\n")
# b_list = list(filter(lambda x: str(x), b_list))
# Drop blank entries (filter_data returns None for whitespace-only strings),
# then terminate every remaining piece with a newline.
b_list = list(filter(filter_data, b_list))
b_list = list(map(lambda x: str(x) + "\n", b_list))
a.extend(b_list)
elif str(line).startswith("____ prize for the winner of the competition is ____ two-week holiday in Paris.A.The; /
B.A; /
C.A; the
D.The; a
【答案】D
【解析】考查冠词,第一个空格表示对比赛获胜者的奖励,是特指,故用the,第二个空格泛指一次为期两周的假期,用不定冠词a,故选D项。句意,这次比赛给获胜者的奖励是在巴黎度假两周。
'''
# cons = '''Unbelievable!Oh..., _____ you don't mind, I'll stop and take a deep breath.
# 【答案】
# 1.if。考查if引导的条件状语从句。根据句意可知。
# 【解析】
# '''
# pprint(cons)
# print(again_parse(cons))
# print(again_parse(cons))
# 2021_04_14_09_38_55
path2 = r"F:\zwj\Text_Structure\fail_files\2021_04_14_09_38_55.html"
# Read the sample file through a context manager so the handle is closed as
# soon as the content has been loaded (it was previously left open).
with open(path2, "r", encoding="utf-8") as html_file:
    html = html_file.read()
# print(html)
# NOTE(review): the two replace() arguments look identical as shown here;
# presumably one of them was a non-breaking space or an entity -- confirm.
print(list(map(lambda x: str(x).replace(" ", " "), again_parse(html))))