# NOTE(review): this copy of the module looks damaged (collapsed lines, markup missing from literals) - see the review notes at the end of the file.
# -*- coding: utf-8 -*- import re from pprint import pprint from pyquery import PyQuery as pq # # pattern = re.compile(r"\[来源.*?\]|www\..*?com") # # filter_words = ["学科网", "高考资源网", "Ziyuanku.com", "WWW.ziyuanku.com", # "高☆考♂资♀源€网", "w.w.w.k.s.5.u.c.o.m", "本资料由《七彩教育网》www.7caiedu.cn 提供!", # "本资料来源于《七彩教育网》http://www.7caiedu.cn", "本资料由《七彩教育网》www.7caiedu.cn 提", # "高考试题来源:"] # # # def filter_word(txt_list): # new_txt_list = [] # for word in txt_list: # if not word.strip(): # continue # new_word = re.sub(pattern, "", word) # for keys in filter_words: # if keys in new_word: # new_word = new_word.replace(keys, "") # new_txt_list.append(new_word) # return new_txt_list def filter_data(x): if not str(x).replace(" ", "").strip(): pass else: return str(x) def replace_k(con): # con = str(con).replace(" ", "+") # con = str(con).replace(" ", "+") con = re.sub(r'\s(?!(src="http|_src="http|class="tiankong"|data-num=))', "#+#", str(con)) return pq(con, parser="html") def css_label_wash(content): # todo add 9-4 """ 清洗文本中带有的css标签 :param content: :return: """ # temp_con = re.sub('', "", str(content)) if re.search('|text\s*-\s*decoration: underline|border\s*-\s*bottom:', str(content)) is None: # content = re.sub('', "\n", content).strip().replace("\n\n", "\n").replace("\n", "
"). content = re.sub('', "

", content) content = re.sub('

', "

\n

", content) return content else: content = re.sub(r'', "", str(content)) # content = re.sub(r'', "", str(content)) #20240704/红色标记先保留 content = re.sub(r'(.+?)', r"【red##\1】", str(content)) content = re.sub(r'', "", str(content)) content = re.sub(r'', "", str(content)) content = content.replace("", "").replace("", "") # content = content.replace("", "").replace("", "") content = content.replace('

 

', "\n").replace('


', "\n") # content = content.replace("

", "

\n") # 2022-4-25 content = re.sub(r"

\s*(?!\n)", "

\n", content) content = re.sub(r"", "

", content) # 2022/1/6 # content = re.sub('', "", content) # content = re.sub('', "\n", content) # parm = False # if "

" not in str(content): # parm = True subs2img = {} if re.search('" in content: all_imgs = re.findall('|.*?', content, flags=re.S) for k, img in enumerate(all_imgs): content = content.replace(img, "&{}&".format(k)) #表格里的公式的标签需要清洗20240704 if "" in img and "math-tex" in img: img = re.sub(r'(.+?)', r'\1', img) subs2img["&{}&".format(k)] = img content = re.sub(r"<(su[bp])>(.*?)", r"【\1】\2【/\3】", content) content = content.replace("<", "【#lt;】") html = pq(content, parser="html") a = [] if html.children(): for line in html.children().items(): #

.*?

里面的内容可能会被过滤掉 test = line.text() # 保留下划线及着重符标签 # 波浪线: # pq会将多个空格换成一个 if '.+?)', r"【1#\1##】", str(line)) line = re.sub(r'(.+?)', r"【2#\1##】", str(line)) line = re.sub(r'<(p style="text-(indent|align):.*?">.+?)

', r"【\1##3】", str(line)) line = line.replace(" ", "【+】") line = pq(line) new_line = list(map(lambda x: str(x).replace("【1#", '").replace("【2#", '') .replace("【p【+】style=", "

").replace("【+】", " "), line.text().split("\n"))) a.extend(new_line) elif str(line).startswith("", "\n").replace("
", "\n").replace("
", "\n") + "\n") a.append(line.html()) else: line = replace_k(line) if re.search(r"\n|", str(line), flags=re.S): line = re.sub(r"\n|", "#*#", str(line), flags=re.S) line = pq(line) new_line = list(map(lambda x: str(x).replace("#+#", " "), line.text().split("#*#"))) a.extend(new_line) else: if line.text().strip(): a.append(line.text().replace("#+#", " ")) elif "

" in str(line) and "
" in str(line): line = re.sub(r'', "", str(line.html())) b = line.replace('

', ""). \ replace("
", "\n"). \ replace("
", "\n"). \ replace("
", "\n"). \ replace('

 

', "\n"). \ replace('

', "\n") b_list = b.split("\n") # b_list = list(filter(lambda x: str(x), b_list)) b_list = list(filter(filter_data, b_list)) b_list = list(map(lambda x: str(x), b_list)) a.extend(b_list) elif str(line).startswith(" ;&变为&; a.append(content.strip()) new_a = "\n".join(list(map(lambda x: str(x).strip(), a))) new_a = re.sub("(\n\s*)+", "\n", new_a) # print("newa:::", new_a) if subs2img: new_a = re.sub(r"|".join(subs2img.keys()), lambda x: subs2img[x.group()], new_a) new_a = "

" + new_a.replace("\n\n", "\n").replace("\n", "

\n

") + "

" #第2个replace:“\n”前加r # for sb, img in subs2img.items(): # 2021 # new_a = new_a.replace(sb, img) # if parm: # new_a[0] = "\xa0" * 4 + new_a[0] new_a = re.sub(r"【(/?su[bp])】", r"<\1>", new_a).replace("【#lt;】", "<") new_a = re.sub(r"【red##(.*?)】", r'\1', new_a) return new_a if __name__ == '__main__': cons = r'''
(1)求货物和重物的质量关系;
(2)要使货物运送到B端,若采用在重物下方挖坑的方法,求至少挖多深的坑;
(3)要使货物运送到B端,若采用配重落地时传送带立刻顺时针转动的方法(启动时间可
忽略),求传送带速度大小的范围和货物从A端传送到B端所用时间的范围(结果保留三位有
效数字)。



物理参考答案
1.C
【解析】绝大多数α粒子沿直线穿过,偏转角很
小,说明原子核很小;A项错误;少数α粒子穿过金箱
后发生较犬角度的偏转是由于少数α粒子穿过金箱
时距离金原子核较近,受到的库仑斥力较大,B项错
误;极少数α粒子被弹回,说明原子核是一个体积小、
''' cons1 = ''' 9 . 中国古代的政治权力由“传贤”转变为“传子”,“家天下”制度开始形成于
A.夏朝B.商朝C.周朝D.秦朝
''' # pprint(cons) # print(again_parse(cons)) # print(again_parse(cons)) # print(list(map(lambda x: str(x).replace(" ", " "), again_parse(cons)))) # con1 = r'

解:A.研究跨栏动作时,刘翔的大小和形状不能忽略,不能看作质点,故A错误;
B.选取不同的参考系,物体的运动状态是不相同的,故B错误;
C.出租车收费是按路程收费的,故C错误;
D.第是指的时间,是指从末到末这一段时间,故D正确;
故选:D.

' path2 = r"F:\zwj\Text_Structure\accept_files\667d0bec1f8a0743e2aabc78_2.html" html = open(path2, "r", encoding="utf-8").read() cons = css_label_wash(html) with open(r"F:\zwj\Text_Structure\accept_files\temp.txt", "w",encoding='utf-8') as f: f.write(cons) print(cons)
# ---------------------------------------------------------------------------
# Review notes (comments only; no code above was changed).
#
# What the readable parts of this module show:
#   * filter_data(x): returns str(x) when str(x) still has content after
#     removing " " and stripping; otherwise it falls through `pass` and
#     implicitly returns None.  Further down it is used as the predicate of
#     filter(...) over b_list.
#   * replace_k(con): replaces every whitespace character that is not directly
#     followed by src="http, _src="http, class="tiankong" or data-num= with
#     the marker "#+#", then parses the result with pq(..., parser="html").
#     Later code turns "#+#" back into " ".
#   * css_label_wash(content): its docstring says it cleans CSS tags out of
#     the text.  The first branch returns `content` early; the other branch
#     collects text lines from the children of the parsed document into the
#     list `a`, joins them with "\n", substitutes the "&k&" placeholders kept
#     in `subs2img` back in, and finally rewrites the 【sub】/【sup】, 【#lt;】
#     and 【red##...】 markers before returning the string.
#   * The __main__ section reads the hard-coded file `path2`, passes it
#     through css_label_wash, writes the result to temp.txt and prints it.
#     The input file is opened with open(...).read() and never closed
#     explicitly.
#
# Why this copy should not be edited or run as-is
# (NOTE(review): presumably an HTML-stripping / whitespace-collapsing step was
# applied to the file - confirm by comparing against version control):
#   * The imports, the commented-out filter_word helper, two complete function
#     definitions and the start of css_label_wash share one physical line that
#     begins with "#", so Python reads that whole line as a comment.
#   * Several calls have empty patterns or needles, e.g. re.sub('', "", ...)
#     and .replace("", "").
#   * re.sub(r"<(su[bp])>(.*?)", r"【\1】\2【/\3】", ...) refers to group 3, but
#     the visible pattern defines only two groups.
#   * Some quotes are unbalanced, e.g. `if re.search('" in content:` and
#     `.replace("【1#", '")`.
#   * Single- and double-quoted string literals are split across physical
#     lines, which is not valid outside triple-quoted strings.
# Restore the original file before making any functional change.