# -*- coding: utf-8 -*-
import re
from pprint import pprint
from pyquery import PyQuery as pq
#
# pattern = re.compile(r"\[来源.*?\]|www\..*?com")
#
# filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com",
# "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!",
# "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提",
# "高考试题来源:"]
#
#
# def filter_word(txt_list):
#     new_txt_list = []
#     for word in txt_list:
#         if not word.strip():
#             continue
#         new_word = re.sub(pattern, "", word)
#         for keys in filter_words:
#             if keys in new_word:
#                 new_word = new_word.replace(keys, "")
#         new_txt_list.append(new_word)
#     return new_txt_list


def filter_data(x):
    # filter() predicate: keep entries that still contain non-space characters
    if not str(x).replace(" ", "").strip():
        pass
    else:
        return str(x)
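# A small hedged example of the intended use: filter_data() works as a
# filter() predicate, so whitespace-only entries are dropped and the rest
# are kept as strings (values here are illustrative, not from real data):
#   list(filter(filter_data, ["A.甲", "   ", "", "B.乙"]))  ->  ['A.甲', 'B.乙']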


def replace_k(con):
    # con = str(con).replace(" ", "+")
    # con = str(con).replace(" ", "+")
    # protect whitespace with a "#+#" marker, except whitespace that sits right
    # before attributes that must not be broken (src="http...", class="tiankong", data-num=)
    con = re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "#+#", str(con))
    return pq(con, parser="html")
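# A minimal sketch (illustrative input) of the "#+#" space marker: any whitespace
# not followed by one of the protected attributes is replaced, so pyquery's
# text() cannot collapse the spacing; css_label_wash() later maps "#+#" back to " ":
#   re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "#+#", "A 1  B")
#   -> 'A#+#1#+##+#B'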


def css_label_wash(content):
    # todo add 9-4
    """
    Clean the CSS-styled markup contained in the text.
    :param content:
    :return:
    """
    # NOTE: a number of HTML tag literals in this function were lost during
    # extraction; the <p>/<br/>/<span>/<img>/<table> patterns below are
    # reconstructed from context and should be read as assumptions.
    # temp_con = re.sub(r'</?p(\s*|\s+style=.*?")?>', "", str(content))
    if re.search(r'</?(span|font|article|ul|ol|div)(\s*|\s+style=.*?")>|text\s*-\s*decoration: underline|border\s*-\s*bottom:',
                 str(content)) is None:
        # content = re.sub(r'<br\s*/?>', "\n", content).strip().replace("\n\n", "\n").replace("\n", "<br/>")
        def subp(s):
            cons = s.group(1)
            # assumed: the original keep/normalise logic for the matched <p ...>
            # tag was lost to extraction; tags with attributes are kept as-is
            if '"' in cons:
                return cons
            return "<p>"

        content = re.sub(r'(<p.*?>)', subp, content)   # assumed <p ...> pattern
        content = re.sub(r'</p>', "</p>\n", content)   # assumed </p> pattern
        return content
    else:
        # assumed tag literals: strip inline wrapper tags whose exact patterns were lost
        content = re.sub(r'<span.*?>', "", str(content))
        content = re.sub(r'</span>', "", str(content))
        content = re.sub(r'<font.*?>', "", str(content))
        content = re.sub(r'</font>', "", str(content))
        content = content.replace("<article>", "").replace("</article>", "")
        # content = content.replace("<o:p>", "").replace("</o:p>", "")
        content = content.replace('<br/>', "\n").replace('<br>', "\n")
        content = content.replace("</p>", "\n")
        # content = re.sub('<p.*?>', "", content)
        # content = re.sub('<br.*?>', "\n", content)
        # parm = False
        # if "<img" not in str(content):
        #     parm = True
        subs2img = {}
        if "<img" in content:  # assumed check; the original also used re.search on an <img> pattern
            # swap <img ...> tags for &k& placeholders so later substitutions leave them intact
            all_imgs = re.findall(r'<img.*?/>|<img.*?>.*?</img>', content, flags=re.S)
            for k, img in enumerate(all_imgs):
                content = content.replace(img, "&{}&".format(k))
                subs2img["&{}&".format(k)] = img
        # protect <sub>/<sup> through text extraction with 【】 markers (restored at the end)
        content = re.sub(r"<(su[bp])>(.*?)</(su[bp])>", r"【\1】\2【/\3】", content)
        html = pq(content, parser="html")
        a = []
        if html.children():
            # temph = [str(i) for i in html.children().items()]
            for line in html.children().items():  # the inner content of some tags may get filtered out here
                test = str(line)  # line.text()
                # keep underline and emphasis-mark tags
                # wavy underline:
                # pq collapses runs of spaces into one
                # print(str(line))
                # assumed condition: the original check for lines that carry
                # formatting to preserve was lost to extraction
                if '<u' in test or '<strong' in test or '<em' in test or 'text-decoration' in test \
                        or 'border-bottom' in test or 'text-indent' in test or 'text-align' in test:
                    # underline rendered as styled spans (assumed patterns)
                    line = re.sub(r'<span style="text-decoration:\s*underline.*?">(.+?)</span>', r"【1#\1##】", str(line))
                    line = re.sub(r'<span style="border-bottom:.*?">(.+?)</span>', r"【2#\1##】", str(line))
                    line = re.sub(r'<(p style="text-(indent|align):.*?">.+?)</p>', r"【\1##3】", str(line))
                    # assumed: the original <div ...> pattern carried five capture groups;
                    # a simplified two-group version is used here
                    line = re.sub(r'<div style="text-(indent|align):.*?">(.+?)</div>',
                                  r"【div\1\2##3】", str(line), flags=re.S)
                    # line = re.sub(r'<(strong|em)>(.+?)</\1>', r"【\1##\2##\1】", str(line))
                    # handling them separately works better: nested formats all get processed
                    line = re.sub(r'<strong>(.+?)</strong>', r"【strong##\1##strong】", str(line))
                    line = re.sub(r'<em>(.+?)</em>', r"【em##\1##em】", str(line))
                    # print(line)
                    line = line.replace(" ", "【+】")
                    line = pq(line)
                    new_line = list(map(lambda x: str(x).replace("【1#", '<u>').replace("##】", '</u>').replace("【2#", '<u>')
                                        .replace("【p【+】style=", "").replace("【strong##", "").replace("【em##", "")
                                        .replace("##strong】", "").replace("##em】", "").replace("【+】", " "),
                                        line.text().split("\n")))
                    a.extend(new_line)
                elif str(line).startswith("<table"):
                    # assumed branch conditions/tags: the originals were lost to extraction
                    a.append(str(line).replace("<br/>", "\n").replace("<br>", "\n").replace("</p>", "\n") + "\n")
                elif str(line).startswith("<ol") or str(line).startswith("<ul"):
                    a.append(line.html())
                else:
                    line = replace_k(line)
                    if re.search(r"\n|<br/>", str(line), flags=re.S):
                        line = re.sub(r"\n|<br/>", "#*#", str(line), flags=re.S)
                        line = pq(line)
                        new_line = list(map(lambda x: str(x).replace("#+#", " "), line.text().split("#*#")))
                        a.extend(new_line)
                    else:
                        if line.text().strip():
                            a.append(line.text().replace("#+#", " "))
elif "" in str(line) and "" in str(line):
line = re.sub(r'', "", str(line.html()))
b = line.replace('', ""). \
replace("
", "\n"). \
replace("
", "\n"). \
replace("
", "\n"). \
replace('
', "\n"). \
replace('
', "\n")
b_list = b.split("\n")
# b_list = list(filter(lambda x: str(x), b_list))
b_list = list(filter(filter_data, b_list))
b_list = list(map(lambda x: str(x), b_list))
a.extend(b_list)
                        elif str(line).startswith("<img") or str(line).startswith("<table"):
                            a.append(str(line))
                        else:
                            # print('test:', line.text())  # images get dropped automatically here
                            if line.text().strip():
                                line = replace_k(line)
                                # a.append(line.text().replace("+", "\xa0") + "\n")
                                a.append(line.text().replace("#+#", " "))
        else:
            # if html.text().strip():  # this strips all the "\n" line breaks
            #     a.append(html.text())
            # a.append(str(html))  # would wrap everything in <p></p>, and & becomes &amp;
            a.append(content.strip())
        new_a = "\n".join(list(map(lambda x: x.strip(), a)))
        if subs2img:
            # restore the protected <img ...> tags
            new_a = re.sub("|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a)
        # new_a = "<p>" + new_a.replace("\n\n", "\n").replace("\n", "<br/>\n") + "</p>"
        # assumed lookbehind/replacement tags below; the original literals were lost
        new_a = re.sub(r"(?<!</p>)\s*\n", "<br/>", new_a.replace("\n\n", "\n"))  # 2024.6.13
        new_a = re.sub(r"<br/>(\n|<br/>)+", "<br/>", new_a)
        new_a = "<p>" + new_a + "</p>"
        new_a = re.sub(r'(</p>)\s*</p>$', r"\1", new_a, flags=re.S)
        # for sb, img in subs2img.items():  # 2021
        #     new_a = new_a.replace(sb, img)
        # if parm:
        #     new_a[0] = "\xa0" * 4 + new_a[0]
        new_a = re.sub(r"【(/?su[bp])】", r"<\1>", new_a)
        return new_a
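# A minimal sketch of the placeholder trick used in css_label_wash(): pq().text()
# drops tags, so <sub>/<sup> are swapped for 【sub】/【sup】 markers first and
# swapped back at the end (values here are illustrative):
#   s = re.sub(r"<(su[bp])>(.*?)</(su[bp])>", r"【\1】\2【/\3】", "H<sub>2</sub>O")
#   t = pq("<p>{}</p>".format(s)).text()            # 'H【sub】2【/sub】O'
#   re.sub(r"【(/?su[bp])】", r"<\1>", t)             # 'H<sub>2</sub>O'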


def again_parse(content):
    # todo add 9-4
    # NOTE: as in css_label_wash(), the tag literals below were lost during
    # extraction and are reconstructed as assumptions
    content = re.sub(r'<span.*?>', "", str(content))
    content = re.sub(r'</span>', "", str(content))
    content = re.sub(r'<font.*?>', "", str(content))
    content = re.sub(r'</font>', "", str(content))
    content = str(content).replace("<article>", "").replace("</article>", "")
    content = str(content).replace("<o:p>", "").replace("</o:p>", "")
    content = str(content).replace('<br/>', "\n").replace('<br>', "\n")
    # parm = False
    # if "<img" not in str(content):
    #     parm = True
    html = pq(content, parser="html")
    a = []
    if html.children():
        for line in html.children().items():
            if (str(line).startswith("<table")):
                # assumed condition/tags; the originals were lost to extraction
                a.append(str(line).replace("<br/>", "\n").replace("<br>", "\n").replace("</p>", "\n") + "\n")
            else:
                line = replace_k(line)
                if "<br/>" in str(line) or "<br>" in str(line) or "<br />" in str(line):
                    line = str(line).replace("<br/>", "###").replace("<br>", "###").replace("<br />", "###")
                    line = pq(line)
                    new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###")))
                    a.extend(new_line)
                else:
                    if line.text().strip():
                        a.append(line.text().replace("+", "\xa0") + "\n")
elif "" in str(line) and "" in str(line):
line = re.sub(r'', "", str(line.html()))
b = line.replace('', ""). \
replace("
", "\n"). \
replace("
", "\n"). \
replace("
", "\n"). \
replace('
', "\n"). \
replace('
', "\n")
b_list = b.split("\n")
# b_list = list(filter(lambda x: str(x), b_list))
b_list = list(filter(filter_data, b_list))
b_list = list(map(lambda x: str(x) + "\n", b_list))
a.extend(b_list)
                    elif str(line).startswith("<img"):
                        a.append(str(line))
    else:
        a.append(content.strip())
    # assumed: the tail of this function was lost to extraction; returning the
    # collected lines matches how it is used in the commented examples below
    return a


cons = '''下述有关功和能量说法正确的是( )
A.物体做功越多,物体的能量就越大
B.摩擦力可能对物体做正功,也可能做负功,也可以不做功
C.能量耗散表明,能量守恒定律有些情况下并不成立
D.弹簧拉伸时的弹性势能一定大于压缩时的弹性势能
【答案】B
【解析】功是能量转化的量度,物体做功越多,物体的能量转化就越多,而不是能量越大。故A错误;摩擦力方向可能与物体运动方向相同、也与物体运动方向相反,所以摩擦力可能对物体做正功,也可能做负功。物体也可能没有位移,摩擦力不做功,故B正确;能量耗散虽然不会使能的总量减少,但能量的可利用率越来越低,即能量的品质越来越低;根据能量守恒定律可知,虽然能量的可利用率越来越低,但能量总和保持不变,仍然遵守能量守恒定律,故C错误;弹簧的弹性势能与形变量有关,弹簧拉伸时与压缩时弹性势能可能相等,也可能拉伸时的弹性势能小于压缩时的弹性势能。故D错误。故选B。
'''
# cons = '''Unbelievable!Oh..., _____ you don't mind, I'll stop and take a deep breath.
# 【答案】
# 1.if。考查if引导的条件状语从句。根据句意可知。
# 【解析】
# '''
# pprint(cons)
# print(again_parse(cons))
# print(again_parse(cons))
# print(list(map(lambda x: str(x).replace(" ", " "), again_parse(cons))))
# con1 = r'解:A.研究跨栏动作时,刘翔的大小和形状不能忽略,不能看作质点,故A错误;B.选取不同的参考系,物体的运动状态是不相同的,故B错误;C.出租车收费是按路程收费的,故C错误;D.第是指的时间,是指从末到末这一段时间,故D正确;故选:D.'
cons1 = """
阅读下面这首宋诗,完成下列小题。(本题共2小题,9分)
除夜野宿常州城外二首(其二)
苏轼
南来三见岁云徂①,直恐终身走道途。
老去怕看新历日,退归拟学旧桃符。
烟花已作青春②意,霜雪偏寻病客须。
但把穷愁博长健,不辞最后饮屠苏③。
【注】①苏轼于熙宁四年(1071)冬到杭州任通判,至作此诗,已度过三个除夕。岁云徂,谓年岁辞去。徂,往。②青春:春季。③古俗,正月初一家人先幼后长依次饮屠苏酒。《时镜新书》晋董勋云:“正旦饮酒先从小者,何也?勋曰:‘俗以小者得岁,故先酒贺之,老者失时,故后饮酒。’”
14.下列对这首诗的理解和赏析,不正确的一项是( )
A.诗人离开朝廷南来已三年,恐怕自己终身奔走于宦途而不能践偿其政治抱负。
B.因桃符一年一换,诗人自比“桃符”,寄托了自己在新的一年仕途晋升的愿望。
C.颈联对仗工整,诗人以自然界万物复苏的繁丽景象来反衬出自己的衰病老迈。
D.本诗抒发了除夜感慨,“桃符”“烟花”“屠苏”等意象照应节令,节日氛围浓厚。
15.末句“不辞最后饮屠苏”意蕴丰富,请简要分析。
【答案】
14. B
15. ①饮屠苏酒的顺序是自少至老,诗人说“最后饮”,表明了他年事已高。②诗人“辞”掉的是富贵荣华,表明他不再以仕途不畅为意。③“不辞最后饮”又表明他不以岁月流逝为意,以豁达乐观待之,表现了诗人豪放旷达的情怀。
【解析】
14. 本题考查学生鉴赏诗歌的形象、表达技巧和情感的能力。B.“寄托了自己在新的一年仕途晋升的愿望”曲解文意。由“退归”可知,本句暗指诗人要抛却不如意的仕途,含有退隐之意。故选B。
15. 本题考查学生理解诗句意蕴的能力。诗人一扫前面的郁闷,表示要用“穷”和“愁”换取长久的健康,要屠苏酒来迎新年。正月初一饮屠苏酒是一种习俗,饮用的顺序是自少至老,诗人说“最后饮”,表明了他年事已高。“烟花”二句,以自然界万物复苏的繁丽景象,反衬自己的衰病老迈。“拟学旧桃符”暗指诗人要抛却不如意的仕途,诗人“辞”掉的是富贵荣华,表明他不再以仕途不畅为意。诗人说“不辞最后饮”,不怕轮到我最后一个把屠苏酒饮,表明他不以岁月流逝为意,以豁达乐观待之,以此观照开篇,更见苏轼豪放旷达的情怀。
"""
conss = css_label_wash(cons1)
print(conss)