# -*- coding: utf-8 -*-
import re
from pprint import pprint
from pyquery import PyQuery as pq
# Matches an inline source attribution: "[来源" followed by the shortest run
# of characters up to the next "]".
pattern = re.compile(r"\[来源.*?\]")

# Watermark / advertisement fragments that filter_word() removes from text.
# Order matters: the full "...提供!" phrase is listed before its truncated
# "...提" variant so the longer form is removed whole first.
filter_words = [
    "学科网",
    "高考资源网",
    "Ziyuanku.com",
    "WWW.ziyuanku.com",
    "高☆考♂资♀源€网",
    "w.w.w.k.s.5.u.c.o.m",
    "本资料由《七彩教育网》www.7caiedu.cn 提供!",
    "本资料来源于《七彩教育网》http://www.7caiedu.cn",
    "本资料由《七彩教育网》www.7caiedu.cn 提",
    "高考试题来源:",
]
def filter_word(txt_list, *, keywords=None, source_pattern=None):
    """Strip source tags and watermark fragments from a list of text lines.

    Each entry has every match of ``source_pattern`` removed, then every
    occurrence of each string in ``keywords`` removed, in list order.
    Entries that are blank before cleaning, or that become blank after
    cleaning (e.g. a line consisting only of a watermark), are dropped.

    Args:
        txt_list: Iterable of strings to clean.
        keywords: Optional iterable of substrings to delete. Defaults to the
            module-level ``filter_words`` list.
        source_pattern: Optional regex (compiled or string) whose matches are
            deleted. Defaults to the module-level ``pattern``.

    Returns:
        A new list with the cleaned, non-blank strings in their original order.
    """
    # Resolve defaults at call time so the module-level constants are only
    # required when the caller does not supply overrides.
    if keywords is None:
        keywords = filter_words
    if source_pattern is None:
        source_pattern = pattern

    cleaned = []
    for word in txt_list:
        if not word.strip():
            continue
        new_word = re.sub(source_pattern, "", word)
        # str.replace is a no-op when the key is absent, so no membership
        # test is needed first.
        for key in keywords:
            new_word = new_word.replace(key, "")
        # A line made up solely of watermark text is blank after cleaning;
        # drop it just like lines that were blank to begin with.
        if new_word.strip():
            cleaned.append(new_word)
    return cleaned
def filter_data(x):
    """Return ``str(x)`` when it holds non-whitespace text, otherwise ``None``.

    The result is truthy exactly for non-blank values, so the function can be
    used as a ``filter()`` predicate that discards empty or whitespace-only
    items.
    """
    text = str(x)
    is_blank = not text.replace(" ", "").strip()
    return None if is_blank else text
def again_parse2(content):
# todo add 9-4
content = re.sub(r'', "", str(content))
content = re.sub(r'', "", str(content))
content = str(content).replace("", "")
content = str(content).replace('
', "\n")
parm = False
if "" not in str(content):
parm = True
html = pq(content, parser="html")
a = []
if html.children():
for line in html.children().items():
if str(line).startswith("", "\n").replace("
", "\n").replace("
", "\n") + "\n")
else:
if "
" in str(line) or "
" in str(line) or "
" in str(line):
line = str(line).replace("
", "###").replace("
", "###").replace("
", "###")
line = pq(line)
new_line = list(map(lambda x: str(x) + "\n", line.text().split("###")))
a.extend(new_line)
else:
if line.text().strip():
a.append(line.text() + "\n")
elif str(line).startswith(""):
line = re.sub(r'', "", str(line.html()))
b = line.replace('
', ""). \
replace("
", "\n"). \
replace("
", "\n"). \
replace("
", "\n"). \
replace('
', "\n").\
replace('
', "\n")
b_list = b.split("\n")
# b_list = list(filter(lambda x: str(x), b_list))
b_list = list(filter(filter_data, b_list))
b_list = list(map(lambda x: str(x) + "\n", b_list))
a.extend(b_list)
elif str(line).startswith("', "", str(content))
content = re.sub(r'', "", str(content))
content = str(content).replace("", "")
content = str(content).replace('
', "\n")
parm = False
if "" not in str(content):
parm = True
html = pq(content, parser="html")
a = []
if html.children():
for line in html.children().items():
if str(line).startswith("", "\n").replace("
", "\n").replace("
", "\n") + "\n")
else:
line = replace_k(line)
if "
" in str(line) or "
" in str(line) or "
" in str(line):
line = str(line).replace("
", "###").replace("
", "###").replace("
", "###")
line = pq(line)
new_line = list(map(lambda x: str(x).replace("+", "\xa0") + "\n", line.text().split("###")))
a.extend(new_line)
else:
if line.text().strip():
a.append(line.text().replace("+", "\xa0") + "\n")
elif str(line).startswith(""):
line = re.sub(r'', "", str(line.html()))
b = line.replace('', ""). \
replace("
", "\n"). \
replace("
", "\n"). \
replace("
", "\n"). \
replace('
', "\n"). \
replace('
', "\n")
b_list = b.split("\n")
# b_list = list(filter(lambda x: str(x), b_list))
b_list = list(filter(filter_data, b_list))
b_list = list(map(lambda x: str(x) + "\n", b_list))
a.extend(b_list)
elif str(line).startswith("