#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Helpers for collecting exam-paper IDs and checking item images.

* ``tongji_paperid_of3`` pages through the exam-paper search endpoint of
  tiku.yunxiao.com and writes the collected paper records to an Excel file.
* ``tongji_itemid_han_img`` walks the ``item.item_spider`` MongoDB collection,
  flattens every item into one text blob, and logs the IDs of items whose
  number of image tags does not match the image count reported by the
  image service configured in ``my_config``.
"""
import json
import re
import time

import pandas as pd
import pymongo
import requests

import my_config

# Chinese subject name -> short English tag used in output file names.
sb_name = {"物理": "phy", "化学": "chem", "生物": "biology", "政治": "politics", "历史": "history",
           "地理": "geography", "数学": "math", "英语": "english", "语文": "chinese"}

_SEARCH_URL = r'https://tiku.yunxiao.com/kb_api/v2/exampapers/by_search'
_SEARCH_YEARS = [2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013]
_PAGE_SIZE = 10
_MAX_CONSECUTIVE_ERRORS = 5
_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever

# NOTE(review): a logged-in session cookie is hard-coded here. It is kept so the
# script behaves as before, but it is a credential and should be moved out of the
# source (configuration / environment) and rotated.
_SEARCH_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
    "Cookie": (
        "hfs-session-id=eyJhbGciOiJIUzI1NiJ9.NjAzZjFmNmQwMDAwMDJkODY5MTE4MTc1LTE2MjE5MjY4NTk0NDQ.rtjc--lU6iK_VPzyIeCIzMJzE6wptKQ1gPO305Xd8y4; "
        "Hm_lvt_d9ce2e93fbe3e9d6109be3910c433855=1632822993,1632968130,1633671293,1633776623; "
        "tiku-session-id=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f97af9ccdcbc3ebfd4f01fdb8231d38820f6453bdab7f2d01c60807636827a3311d822be1011b8035a391f66470d1a8286aa119cb2b0722f51df33ea817fae5369bf0d766dae59f8609f91fb8874176d1590ebc984feced206b7db52b6db0f6bf43ff3a4a911c7775dc318456a04a790124e3f7f8cee36027290e237a3bd15968574af85602d4c11ff41e8febdb6edc986b271572d701e058355435233ced3b8414ecde9685ece8f796491626ddb6dc822b70bab9a59382e8d79735ebb9430a7ba8ec4e888e65c2fad86bab13142b1786bec2b4e4721947d6149438c5636670c00398bfbbf4da250d2f30a7f76e201920bc39d597d14d9f52f8596974fb0092d05d3e0fc0736711b7677037e6dd42235cd5; "
        "tiku-user-info=%7B%22id%22%3A1006589475%2C%22name%22%3A%22%E5%AD%A6%E7%94%9F589475%22%2C%22role%22%3A%22%E5%AD%A6%E7%94%9F%22%2C%22grade%22%3A%22%E5%88%9D%E4%B8%80%22%2C%22avatar%22%3A%22%22%2C%22schoolId%22%3A1000000001%2C%22schoolName%22%3A%22wxyunxiaozb%22%2C%22userId%22%3A1006589475%7D; "
        "tiku-api-key=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f976d4dc058f9749edfcbff02b6dc848ed01af9958693a087da089660a46e2e444711413443d4fa3c5c3889c1b7bbe3fa06ace39abc763a9a3df039ac2e0ea18d3562a5b4c8c92bb8adff722b6b4539d56a3d61c22d4194421a7ded4b3e92edd0eb58b47e0d77f9dd7d929f64b9c32998918b5dd2c7707b16b0c2db02d34ab58b5db930bb5a6129fa557b5d81491dff8cb9550b55b61ccdcaec181e9a1afd0e1cc9; "
        "tiku-is-vip=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f978399c9439a09eb516f60e3c48e28b80ee22927b8e8bfd2782f3080fd67ca7808b1f96836310c8ef59ebd1ec7060821d965fb4bd574a9d9c26f42df70a1343e1886f2341dc0be6c2d5e1098ffe9f17ad67be7004209106bc0d6cf5d85776794b56f45d8d04f5329e24a6bfeb5596b8dc6bfbc87c07f8c17e9393bb0140e22c39dadaa095a32cafb2fe878a4723627e455d7d1d871397cd0693376b6834585e6c0; "
        "Hm_lpvt_d9ce2e93fbe3e9d6109be3910c433855=1634105143"
    ),
}

# Matches an image tag inside the flattened item text.
# NOTE(review): in the reviewed copy of this file the pattern read
# '(|]*? src=[^【]*?/?>)' -- its first alternative was empty (so it matched the
# empty string at every position) and the second had lost its opening. The
# '<img' openings below are a reconstruction; confirm against the upstream copy.
_IMG_TAG_RE = re.compile(r'(<img src=[^【]*?/?>|<img [^>]*? src=[^【]*?/?>)')

# period + subject -> folder-name prefix used by the image service.
_IMG_FOLDER_PREFIX = {"初中物理": "junior_phy", "初中数学": "junior_math",
                      "初中化学": "junior_chem", "初中生物": "junior_biology"}

# Separator placed between text fragments when an item is flattened.
# NOTE(review): in the reviewed copy this literal appeared as a raw line break
# inside the quotes; it may originally have been an HTML line-break tag -- confirm.
# The flattened text is only scanned for image tags, so the exact value does not
# change the result of the check.
_LINE_BREAK = "\n"
_PART_SEP = "【" + _LINE_BREAK + "】"


def _paper_record(item_dict, year):
    """Build the spreadsheet row for one entry of the search result.

    ``provinces``, ``province``, ``vague_name`` and ``city`` are optional in the
    response and fall back to empty values; the other keys are required.
    """
    return {
        "paper_id": item_dict["id"],
        "grade": item_dict["grade"],
        "year": year,
        "type": item_dict["type"],
        "paper_name": item_dict["name"],
        "provinces": item_dict.get("provinces", []),
        "province": item_dict.get("province", ""),
        "vague_name": item_dict.get("vague_name", ""),
        "city": item_dict.get("city", ""),
    }


def tongji_paperid_of3(subject, period):
    """Collect exam-paper records for one subject/period and save them to Excel.

    For every year in ``_SEARCH_YEARS`` the search endpoint is paged through
    (10 papers per page) until an empty page is returned. A page that fails is
    retried; after more than 5 consecutive failures the current year is
    abandoned and the next year is processed.

    Note that the start-up message says "last 3 years" while the year list
    actually spans 2013-2022.

    :param subject: Chinese subject name; must be a key of ``sb_name``.
    :param period: school period string sent to the API (e.g. "初中").
    :return: None. The rows are written to a fixed ``paperid_<tag>_junior.xlsx``
        path.
    """
    print('-----------开始统计近3年的paperid------------')
    label = period + subject
    all_paper_id = []
    for year in _SEARCH_YEARS:
        data = {"period": period, "subject": subject, "to_year": year,
                "limit": _PAGE_SIZE, "offset": 0, "sort_by": "year"}
        page = 0
        mistaken_times = 0
        while True:
            data['offset'] = _PAGE_SIZE * page
            time.sleep(2)  # throttle: two seconds between requests
            try:
                res = requests.post(_SEARCH_URL, headers=_SEARCH_HEADERS, data=data,
                                    timeout=_REQUEST_TIMEOUT).text
                res_list = json.loads(res)['data']['exampapers']
                # Build the whole page before storing it, so a malformed entry
                # cannot leave half a page behind that the retry would duplicate.
                page_records = [_paper_record(item_dict, year) for item_dict in res_list]
            except (requests.RequestException, ValueError, KeyError, TypeError):
                mistaken_times += 1
                print(label + '试卷id第' + str(page + 1) + '页获取异常')
                if mistaken_times > _MAX_CONSECUTIVE_ERRORS:
                    break
                continue
            if not res_list:
                print('当前{}年的试卷完成'.format(year))
                break
            all_paper_id.extend(page_records)
            print(label + '试卷id第' + str(page + 1) + '页统计完成')
            mistaken_times = 0
            page += 1
    print(label + '试卷id统计完成')
    res = pd.DataFrame(all_paper_id)
    res.to_excel(r"F:\zwj\WL\structured_item_ruku\files\hfs\paperid_{}_junior.xlsx".format(sb_name[subject]),
                 index=False)


def img_info_sub(all_items, period, sj, itemid):
    """Check that the image service holds as many images as the item text uses.

    The image folder is identified by the period + subject prefix and the
    item id.

    :param all_items: flattened item text to scan for image tags.
    :param period: school period, e.g. "初中".
    :param sj: Chinese subject name; ``period + sj`` must be a key of
        ``_IMG_FOLDER_PREFIX``.
    :param itemid: item id used in the folder name.
    :return: ``"失败"`` when the text contains image tags and the service
        cannot be reached, answers with something that is not an integer, or
        reports a different count; ``""`` otherwise (including "no images").
    :raises KeyError: if ``period + sj`` has no folder prefix.
    """
    folder = "/{}_".format(_IMG_FOLDER_PREFIX[period + sj]) + str(itemid)
    all_image = _IMG_TAG_RE.findall(all_items)
    if not all_image:
        return ""
    # Ask the service how many files the item's image folder contains.
    try:
        ims_count = int(requests.get(my_config.GET_SANTI_IMG_AMOUNT + folder,
                                     timeout=_REQUEST_TIMEOUT).text)
    except (requests.RequestException, ValueError):
        return "失败"
    return "" if len(all_image) == ims_count else "失败"


def _join_answer(ans):
    """Collapse a list answer into one string; other values pass through.

    Short lists (at most 8 characters in total) are joined with "、", longer
    ones with ";".
    """
    if isinstance(ans, list):
        sep = "、" if len("".join(ans)) <= 8 else ";"
        return sep.join(ans)
    return ans


def _compose_item_text(info):
    """Flatten one item document into a single text blob.

    The blob concatenates stems, options, answers, solutions and explanations
    with the marker strings used below.

    :param info: item document; reads ``blocks``, ``type`` and the optional
        ``description`` (shared stem).
    :return: the flattened text, or ``None`` when the stem text is blank.
    """
    blocks = info["blocks"]
    item_type = info["type"]
    stems = blocks["stems"]
    stems_list = [s["stem"] for s in stems]  # every stem entry is expected to have "stem"
    com_stem = info.get("description") or ""  # shared stem
    stems_str = com_stem + "【设问】" + _PART_SEP.join(stems_list)

    options = [[] for _ in stems_list]
    if "options" in str(stems):  # some stem carries options
        options = [list(s["options"].values()) if "options" in s else [] for s in stems]
        # NOTE(review): the original indentation was lost; this branch is assumed
        # to be nested under the "has options" check -- confirm.
        if item_type in ["解答题", "实验题"]:
            # Long-answer items with options are not split: the options are
            # folded into the stem text and the option list is cleared.
            # NOTE(review): s["options"] is used as a mapping above (.values()),
            # so joining it directly concatenates its keys, not the option
            # texts -- confirm this is intended.
            merged = [s["stem"] + _LINE_BREAK + _LINE_BREAK.join(s["options"]) if "options" in s else s["stem"]
                      for s in stems]
            stems_str = com_stem + "【设问】" + _PART_SEP.join(merged)
            options = [[] for _ in stems_list]

    if not stems_str.strip():
        # Items without stem text are skipped. The "【设问】" marker currently
        # makes this unreachable; it is kept as a safeguard.
        return None

    solutions = _PART_SEP.join(blocks["solutions"]) if "solutions" in blocks else ""  # worked solutions
    if "solutions" in blocks and "answers" in blocks:
        have_sol_list = [bool(s) for s in blocks["solutions"]]
        answers = []
        for k, ans in enumerate(blocks["answers"]):
            new_ans = _join_answer(ans)
            # Answers and solutions are expected to pair up one-to-one; a
            # missing solution entry is treated as "no solution".
            has_sol = k < len(have_sol_list) and have_sol_list[k]
            if not new_ans:
                if has_sol:
                    new_ans = "见解析"
            elif len(have_sol_list) == 1 and not have_sol_list[0]:
                solutions = "略"  # an answer exists but there is no solution
            answers.append(new_ans or "")
    else:
        answers = []
        if "answers" in blocks:
            try:
                answers = sum(blocks["answers"], [])  # flatten a list of lists
            except TypeError:
                answers = blocks["answers"]  # already a flat list

    explanations = _LINE_BREAK.join(blocks["explanations"]) if "explanations" in blocks else ""  # analysis
    options_str = ["【@1@】".join(op) for op in options]
    return "【@@】".join([stems_str, "【@2@】".join(options_str), "【!】".join(answers), solutions, explanations])


def tongji_itemid_han_img(sj, period):
    """Scan the stored items of one subject/period and log image-check results.

    Every matching document of ``item.item_spider`` is flattened and passed to
    ``img_info_sub``. The ID of every checked item is appended to the
    "all_id" log; IDs whose image check fails are additionally appended to the
    "src_fail_id" log. Both logs are opened per item so that progress survives
    an interrupted run.

    :param sj: Chinese subject name; must be a key of ``sb_name``.
    :param period: school period, e.g. "初中".
    :return: None.
    """
    fail_log = r"F:\zwj\WL\structured_item_ruku\to_zyk\logs\junior_{}_src_fail_id_10-22.txt".format(sb_name[sj])
    all_log = r"F:\zwj\WL\structured_item_ruku\to_zyk\logs\junior_{}_all_id_10-22.txt".format(sb_name[sj])
    query = {
        "subject": sj,
        "period": period,
        # Optional filters kept for manual runs:
        # "blocks.answers": {"$exists": True},
        # "item_id_zxhx": {"$exists": True},
        # "status_zyk": {"$exists": False},
        # "item_id_zxhx": "201511100187890",
        # "status": {"$exists": False},  # re-parse
        # "year": {"$gt": 2018},
        # "type": {"$notin": ["阅读理解","完形填空", "书面表达",]},
        # "type": "语法填空",
        # "id": 2899530948,
        # "id": {"$nin": re_id2},
        # "id": {"$nin": [4156438783]},
    }
    # NOTE(review): database host and credentials are hard-coded; they are kept so
    # the script behaves as before, but should be moved into configuration.
    mongo_santi = pymongo.MongoClient(host='49.232.97.180', port=8888, username='root',
                                      password='oyiqd!oy@wxc=ykw@2*jei!')
    try:
        meta_db_santi = mongo_santi["item"]["item_spider"]
        for info in meta_db_santi.find(query):
            item_id = info["id"]
            print(item_id)
            all_items = _compose_item_text(info)
            if all_items is None:
                continue
            # Image check; a failed item is logged and the scan moves on.
            if img_info_sub(all_items, period, sj, item_id) == "失败":
                with open(fail_log, 'a+', encoding='utf-8') as f1:
                    f1.write('{}\n'.format(item_id))
            # NOTE(review): assumed to run for every item, not only failed ones
            # (original indentation was lost) -- confirm.
            with open(all_log, 'a+', encoding='utf-8') as f2:
                f2.write('{}\n'.format(item_id))
    finally:
        mongo_santi.close()


if __name__ == '__main__':
    from pprint import pprint

    # tongji_paperid_of3("英语", "初中")
    tongji_itemid_han_img("化学", "初中")

    # ------------------------------------------------------------------------------------
    # Scratch code kept from earlier manual runs:
    # df1 = pd.read_excel(r"F:\zwj\WL\structured_item_ruku\files\hfs\paperid_math_junior.xlsx")
    # df1 = df1[df1["grade"] == "中考专题"]
    # paper_ids = df1["paper_id"].tolist()
    # # --------------------------------------------------------------------------------------
    # rest_ids = []
    # res = json.loads(open(r"F:\zwj\WL\structured_item_ruku\files\hfs\junior_math.json").read())
    # for id in paper_ids:
    #     if id not in res['ids']:
    #         rest_ids.append(id)
    # # print(rest_ids)
    # res_ids = {'ids': rest_ids}
    # re_f = open(r"F:\zwj\WL\structured_item_ruku\files\hfs\paperid_math_中考_junior.json", 'w', encoding='utf-8')
    # json.dump(res_ids, re_f)

    # f_lists = open(r"F:\zwj\WL\structured_item_ruku\files\hfs\all_citys.txt",'r', encoding="utf-8").readlines()
    # city_dd = {}
    # for i, v in enumerate(f_lists):
    #     if i%2 == 1:
    #         # print(v)
    #         city_dd[f_lists[i-1].strip()] = re.split("\s+", v.strip())
    # pprint(city_dd)
    # provinces = sum([[k] * len(v2) for k, v2 in city_dd.items()], [])
    # citys = []
    # print(provinces, len(provinces))
    # print(sum(city_dd.values(), []), len(sum(city_dd.values(),[])))
    # res = pd.DataFrame({"省份、直辖市、自治区": provinces, "市": sum(city_dd.values(), [])})
    # res.to_excel(r"F:\zwj\WL\structured_item_ruku\files\hfs\资源获取统计.xlsx",
    #              index=False)