get_items_info_from_hfs.py 20 KB


  1. #!/usr/bin/env/python
  2. # -*- coding:utf-8 -*-
import json
import re
import time

import pandas as pd
import pymongo
import requests

import my_config
  9. sb_name = {"物理": "phy", "化学": "chem", "生物": "biology", "政治": "politics", "历史": "history", "地理": "geography",
  10. "数学": "math", "英语": "english", "语文": "chinese"}
  11. def tongji_paperid_of3(subject, period):
  12. print('-----------开始统计近3年的paperid------------')
  13. headers = {
  14. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
  15. "Cookie": "hfs-session-id=eyJhbGciOiJIUzI1NiJ9.NjAzZjFmNmQwMDAwMDJkODY5MTE4MTc1LTE2MjE5MjY4NTk0NDQ.rtjc--lU6iK_VPzyIeCIzMJzE6wptKQ1gPO305Xd8y4; Hm_lvt_d9ce2e93fbe3e9d6109be3910c433855=1632822993,1632968130,1633671293,1633776623; tiku-session-id=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f97af9ccdcbc3ebfd4f01fdb8231d38820f6453bdab7f2d01c60807636827a3311d822be1011b8035a391f66470d1a8286aa119cb2b0722f51df33ea817fae5369bf0d766dae59f8609f91fb8874176d1590ebc984feced206b7db52b6db0f6bf43ff3a4a911c7775dc318456a04a790124e3f7f8cee36027290e237a3bd15968574af85602d4c11ff41e8febdb6edc986b271572d701e058355435233ced3b8414ecde9685ece8f796491626ddb6dc822b70bab9a59382e8d79735ebb9430a7ba8ec4e888e65c2fad86bab13142b1786bec2b4e4721947d6149438c5636670c00398bfbbf4da250d2f30a7f76e201920bc39d597d14d9f52f8596974fb0092d05d3e0fc0736711b7677037e6dd42235cd5; tiku-user-info=%7B%22id%22%3A1006589475%2C%22name%22%3A%22%E5%AD%A6%E7%94%9F589475%22%2C%22role%22%3A%22%E5%AD%A6%E7%94%9F%22%2C%22grade%22%3A%22%E5%88%9D%E4%B8%80%22%2C%22avatar%22%3A%22%22%2C%22schoolId%22%3A1000000001%2C%22schoolName%22%3A%22wxyunxiaozb%22%2C%22userId%22%3A1006589475%7D; tiku-api-key=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f976d4dc058f9749edfcbff02b6dc848ed01af9958693a087da089660a46e2e444711413443d4fa3c5c3889c1b7bbe3fa06ace39abc763a9a3df039ac2e0ea18d3562a5b4c8c92bb8adff722b6b4539d56a3d61c22d4194421a7ded4b3e92edd0eb58b47e0d77f9dd7d929f64b9c32998918b5dd2c7707b16b0c2db02d34ab58b5db930bb5a6129fa557b5d81491dff8cb9550b55b61ccdcaec181e9a1afd0e1cc9; tiku-is-vip=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f978399c9439a09eb516f60e3c48e28b80ee22927b8e8bfd2782f3080fd67ca7808b1f96836310c8ef59ebd1ec7060821d965fb4bd574a9d9c26f42df70a1343e1886f2341dc0be6c2d5e1098ffe9f17ad67be7004209106bc0d6cf5d85776794b56f45d8d04f5329e24a6bfeb5596b8dc6bfbc87c07f8c17e9393bb0140e22c39dadaa095a32cafb2fe878a4723627e455d7d1d871397cd0693376b6834585e6c0; 
Hm_lpvt_d9ce2e93fbe3e9d6109be3910c433855=1634105143"
  16. }
  17. headers1 = {
  18. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  19. "Cookie": "hfs-session-id=eyJhbGciOiJIUzI1NiJ9.NjEwYTVjZmMwMDAwMDI3YzEwZjFmYWM3LTE2MzQyNjM3OTA1NzI.RGGDPYzmOuqKsJhi_pn-fmlz0WiM6_Gs6uim_vmUlAI; Hm_lvt_d9ce2e93fbe3e9d6109be3910c433855=1632968130,1633671293,1633776623,1634263692; tiku-session-id=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f974a0042bbae64472f808b75abd3af377d3e7463d2cc8ad7acac3fe584a28a65449cdaf748a65903bbf81f8d8c0e62b895c3e7aac24d4e2a672d0595501b866fd9aac283fef5781acca3b75fb056fd173d503a912b912139d303aefaf45ff0405b34325bfa7dec6c04dc75efcfb212a9ce4f208ac79a41e1459f0ca0abf61762fa8bfdff14b89bcd0fac2d19a9aff227d6947e0a82a471eabeec9c2c1ba369030234d66f4d458654bcc3f2b5855d64bffe9938aa2b59566ce4801a53b0537fffce7d30f00a5e68dcfb2a823711300c6c32652c54352a109c946777528c568b05a2dc9dc87b67c5c2c6101fcdf70e2bd77e300f452cbb4b1d1697c39c18180cbdbf21f474332a5bdd8ec5a4b641288f48ebed1a4af66c1a1681fedf253dd548ee8d55e8aab70da2307c6de6ac4718c13039586b3fc161ff677716c69a3b90051c09; tiku-user-info=%7B%22id%22%3A27052753%2C%22name%22%3A%22%E5%BC%A0%E4%B8%89%22%2C%22role%22%3A%22%E5%AE%B6%E9%95%BF%22%2C%22grade%22%3A%22%E4%BA%94%E5%B9%B4%E7%BA%A7%22%2C%22avatar%22%3A%22%22%2C%22schoolId%22%3A3151%2C%22schoolName%22%3A%22%E6%83%A0%E5%B7%9E%E4%BB%B2%E6%81%BA%E9%AB%98%E6%96%B0%E5%8C%BA%E6%83%A0%E7%8E%AF%E5%AE%8F%E4%B8%9A%E4%B8%AD%E8%8B%B1%E6%96%87%E5%AD%A6%E6%A0%A1%22%2C%22userId%22%3A27052753%7D; tiku-api-key=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f976d4dc058f9749edfcbff02b6dc848ed070559cae5d2c2e355655e840dd83b4384a4da2396f55ea43f71cb2a7cb6bb9c9b2a2aab2106757daf37e13c2a2bcebf84a72d8a0d804772f68fcc6da7f3d67851b8604ca318e9366b6ed093e0317db5e8599b9b36a8ae58b15e8dd422386d613c05a605890a3957284bb41c0253b1d52af9c34a33553c604728417691d3b62258b02a3a4e3705bb287c21553e23cd803; 
tiku-is-vip=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f978399c9439a09eb516f60e3c48e28b80e394ee916f4b913eda44921aaca30414a3510010cac4b228ae648410d654b0296b2e010dd68a5a7345e648d60ad2b93232c4f7633a458e8b6cf3e8b947f0185c855610f587680ef0204363719919532e838f2bdf25ba3784c67588f9c85843b833db7c8c427edd28d32155f3f026fa9781cd6118d33a57366e2106cea0aa788e4; Hm_lpvt_d9ce2e93fbe3e9d6109be3910c433855=1634293233"
  20. }
  21. headers3 = {
  22. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  23. "Cookie": "hfs-session-id=eyJhbGciOiJIUzI1NiJ9.NjEwYTVjZmMwMDAwMDI3YzEwZjFmYWM3LTE2MzQyNjM3OTA1NzI.RGGDPYzmOuqKsJhi_pn-fmlz0WiM6_Gs6uim_vmUlAI; tiku-api-key=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f976d4dc058f9749edfcbff02b6dc848ed0136bc87bb8bc3886167b15e37e9f2f0447b483c5d31250e089cabff95f8393312c2945625844117b17a4516e601f7c010f18dcc8c34a2669997c8866da923cd0e49c5e841f05c0dd709c029fbb73c9b0c1066fe9d7948f8a8635c9748801a6f8b7ae45a6678bd5ed99804fb0963269edf7a780251fd79c5ee16de46fdd498f08c802a57f87d3035454b77838bf823912; tiku-is-vip=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f978399c9439a09eb516f60e3c48e28b80ed1ab8eab66acb25f0fa57046ef975f7c54a8680869ea22d601175d2396b8d8d0fb0660995ddac86010e054e6c6bf2755ee28b9b0e2de92c1dba3d2aaeaabbb765e61ba04e3d5797db9c1d4a697053f113cdf6cc9f30f47d9e9288f5d3a6ef339be965dc14566017ef2575ce4a17dc087449cd1d4e35a8d36224799722098a9f7106d3acbdf4365a0850f463f66fedc20; tiku-session-id=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f97af9ccdcbc3ebfd4f01fdb8231d38820f9a2089138618b87df1eac1e566642fcfe690eb28f5a9b407e857d0aeebb104b49e8e4cc2f5900a74662b8dd0acae33b0d8b5c21c580a962056f103c42a433c717d6f5c6f587a1857c7bb88c833a9c4940d0cba27c5519ad4a3fe0ab894cfc0ef11b95177c68a130867cd9fbe9e7fe25e5cf7c4499d833d4b8ab879254efe5a175c5dc52fc5d340a79fc8ee079e65cf50eb96bac54608cc9ef7485ace13a99a092b862818f4ee91c33daf8ce4666a9f49afda76df2a07d77ce65ae84079258ae29594374e9875780508466e576e0b8562c233d9ec83438429e7d935eb7a80ab965977e89db07b66c070196224da71875f96b9bad8ef36f3b10b2327f0fd9ae0df; Hm_lvt_d9ce2e93fbe3e9d6109be3910c433855=1633776623,1634263692,1634293439,1634522809; Hm_lpvt_d9ce2e93fbe3e9d6109be3910c433855=1634522809; 
tiku-user-info=%7B%22id%22%3A1007634677%2C%22name%22%3A%22%E5%AD%A6%E7%94%9F634677%22%2C%22role%22%3A%22%E5%AD%A6%E7%94%9F%22%2C%22grade%22%3A%22%E5%88%9D%E4%B8%80%22%2C%22avatar%22%3A%22%22%2C%22schoolId%22%3A1000000001%2C%22schoolName%22%3A%22wxyunxiaozb%22%2C%22userId%22%3A1007634677%7D",
  24. }
  25. headers4 = {
  26. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  27. "Cookie": "hfs-session-id=eyJhbGciOiJIUzI1NiJ9.NjEwYTVjZmMwMDAwMDI3YzEwZjFmYWM3LTE2MzQyNjM3OTA1NzI.RGGDPYzmOuqKsJhi_pn-fmlz0WiM6_Gs6uim_vmUlAI; Hm_lvt_d9ce2e93fbe3e9d6109be3910c433855=1634522809,1634610468,1634610570,1634639498; tiku-api-key=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f976d4dc058f9749edfcbff02b6dc848ed0c9306e2f1eead90480de7571585d27aa465ced1a7012d8f9da784902d38326a56f1d49f0e2ada153b76d7b7e9bbe0dc63c5584f345fe79e7a3816910199fc7060cb2d6467ec16ac8b0d31a559fc4ed0c070c128e4ffed6b4c44eddae96b1515eb441bcaa0e5ab5be2b3b3914d2cbcb5e94408794a177573a01042b30426b6a28cd8cfde950f7e3a22b599e64aa0b73d9; tiku-session-id=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f97af9ccdcbc3ebfd4f01fdb8231d38820fe061b66691a16cb864c259aaf91e4aa31931ad5a908235358816cf5eaec5240abf516a9285ae5a620953af472e4d5066a1ff8b598622f6f7fd092cd79c6309388721eb109d82aae51610d70065ce39bde6ae76fd2de94a1e12b6147f049e5778bd273c4388f06e61269b74207ee415bb4c52ebc1ab57f60e3b9152c634dfa0ed5a03726f172acb70f9d724211ec58026a78e6ecced5c6b8e77c8a8ccc3db0d87891d2a01a90938ed1b704acd8afb0e52ffd179aff3ee597e248f901b60df8a26c4fcf812d0f577d69cb11b5510de5aeb2a520cdde8dda9d49171070c5786b7f56c04260c03dbcc73ce5e970205bdafacd120d318b10100642978897eeb37dd68; tiku-is-vip=6719da848e0ec864081ea55f28ff77ab197f90829085338f8d5b7871aeae4f978399c9439a09eb516f60e3c48e28b80e2a2f4f8da03f8e321fd20ae6fe8943ec4046b4e4e12736586b79b6842c928c6984cc6bf2cf75e8df80d531f2d3dd038ffc75c7c43f5fae3cf1c5dcfe11ecfe2c79c017b2436b18e947b93771250c25c5a981bc146a0f9fd8b5d623436d1499a2f9ecc9e64fd59acb4f7e8ba00324b9cbf8266e2a6c38e59c77c4cda8e0601feafa7b051a1b083be37e60e53da3eec718; tiku-user-info=%7B%22id%22%3A1007634280%2C%22name%22%3A%22%E5%AD%A6%E7%94%9F634280%22%2C%22role%22%3A%22%E5%AE%B6%E9%95%BF%22%2C%22grade%22%3A%22%E5%88%9D%E4%B8%80%22%2C%22avatar%22%3A%22%22%2C%22schoolId%22%3A1000000001%2C%22schoolName%22%3A%22wxyunxiaozb%22%2C%22userId%22%3A1007634280%7D; 
Hm_lpvt_d9ce2e93fbe3e9d6109be3910c433855=1634641367",
  28. }
  29. all_paper_id = []
  30. for year in [2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013]:
  31. data = {"period": period, "subject": subject, "to_year": year, "limit": 10, "offset": 0, "sort_by": "year"}
  32. i = 0
  33. flag = True
  34. mistaken_times = 0
  35. while flag:
  36. time.sleep(1)
  37. offset = 10 * i
  38. data['offset'] = offset
  39. try:
  40. time.sleep(1)
  41. res = requests.post(r'https://tiku.yunxiao.com/kb_api/v2/exampapers/by_search',
  42. headers=headers, data=data).text
  43. res_list = json.loads(res)['data']['exampapers']
  44. if len(res_list) == 0:
  45. print('当前{}年的试卷完成'.format(year))
  46. flag = False
  47. else:
  48. for item_dict in res_list:
  49. one_paper = {"paper_id": item_dict["id"],
  50. "grade": item_dict["grade"],
  51. "year": year,
  52. "type": item_dict["type"],
  53. "paper_name": item_dict["name"],
  54. "provinces": [],
  55. "province": "",
  56. "vague_name": "",
  57. "city": ""}
  58. if "provinces" in item_dict:
  59. one_paper["provinces"] = item_dict["provinces"]
  60. if "province" in item_dict:
  61. one_paper["province"] = item_dict["province"]
  62. if "city" in item_dict:
  63. one_paper["city"] = item_dict["city"]
  64. if "vague_name" in item_dict:
  65. one_paper["vague_name"] = item_dict["vague_name"]
  66. all_paper_id.append(one_paper)
  67. print(data['period'] + data['subject'] + '试卷id第' + str(i + 1) + '页统计完成')
  68. mistaken_times = 0
  69. i = i + 1
  70. except:
  71. mistaken_times = mistaken_times + 1
  72. print(data['period'] + data['subject'] + '试卷id第' + str(i + 1) + '页获取异常')
  73. if mistaken_times > 5:
  74. flag = False
  75. print(data['period'] + data['subject'] + '试卷id统计完成')
  76. res = pd.DataFrame(all_paper_id)
  77. res.to_excel(r"F:\zwj\WL\structured_item_ruku\files\hfs\paperid_{}_junior.xlsx".format(sb_name[subject]), index=False)
  78. def img_info_sub(all_items, period, sj, itemid):
  79. """
  80. 图片处理
  81. 根据subject字段+id字段获取图片
  82. :return:
  83. """
  84. # all_image1 = re.findall(r'(<img( (alt|height|width|title)=.*?| (alt|height|width|title))? src=.*?/?>)', all_items)
  85. all_image = re.findall(r'(<img src=[^【]*?/?>|<img [^<>]*? src=[^【]*?/?>)', all_items)
  86. # print(all_image)
  87. # all_image = list(set(all_image1)) # 顺序打乱了,有问题
  88. # all_image.sort(key=all_image1.index)
  89. imgurl_name = {"初中物理": "junior_phy", "初中数学": "junior_math", "初中化学": "junior_chem",
  90. "初中生物": "junior_biology"}
  91. src_basepath = my_config.SANTI_IMG_SAVE_PATH + "/{}_".format(imgurl_name[period+sj]) + str(
  92. itemid) + "/"
  93. # src_basepath = my_config.SANTI_IMG_SAVE_PATH + "/physical_" + str(self.info["id"]) + "/"
  94. is_src_fail = 0
  95. if all_image:
  96. # all_image = [img[0] for img in all_image]
  97. # 判断某图片文件夹下的文件个数
  98. try:
  99. # ims_count = requests.get(my_config.GET_SANTI_IMG_AMOUNT + "/physical_" + str(self.info["id"])).text
  100. ims_count = requests.get(my_config.GET_SANTI_IMG_AMOUNT + "/{}_".format(imgurl_name[period+sj]) + str(itemid)).text
  101. except:
  102. ims_count = 0
  103. # print(len(all_image), '-----', ims_count)
  104. if ims_count==0 or len(all_image) != int(ims_count):
  105. return "失败"
  106. return ""
  107. def tongji_itemid_han_img(sj, period):
  108. """
  109. 统计含图片的试题的id
  110. :return:
  111. """
  112. mongo_santi = pymongo.MongoClient(host='49.232.97.180', port=8888, username='root',
  113. password='oyiqd!oy@wxc=ykw@2*jei!')
  114. meta_db_santi = mongo_santi["item"]["item_spider"]
  115. query = {
  116. "subject": sj,
  117. "period": period,
  118. # "blocks.answers": {"$exists": True},
  119. # "item_id_zxhx": {"$exists": True},
  120. # "status_zyk": {"$exists": False},
  121. # "item_id_zxhx": "201511100187890",
  122. # "status": {"$exists": False}, # "再解析"
  123. # "year": {"$gt": 2018},
  124. # "type": {"$notin": ["阅读理解","完形填空", "书面表达",]}, # 语法填空---选词填空 完形填空 阅读理解 七选五
  125. # "type": "语法填空", # 单选题
  126. # "id": 2899530948, # 4156438783 不要
  127. # "id": {"$nin": re_id2},
  128. # "id": {"$nin": [4156438783]},
  129. }
  130. for info in meta_db_santi.find(query):
  131. blocks = info["blocks"]
  132. item_id = info["id"]
  133. print(item_id)
  134. item_type = info["type"]
  135. stems_list = [i["stem"] for i in blocks["stems"]] # 默认肯定有“stem”
  136. com_stem = info["description"] if "description" in info and \
  137. info["description"] else "" # 公共题干
  138. stems_str = com_stem + "【设问】" + "【<br/>】".join(stems_list) # 题干,默认先按没有选项的处理
  139. # 选项和题干
  140. options = [[]] * len(stems_list)
  141. if "options" in str(blocks["stems"]): # 大题也会有选项的情况
  142. options = [list(i["options"].values()) if "options" in i else [] for i in blocks["stems"]]
  143. # 对大题的选项和题干重新处理 # 大题也会有选项的情况
  144. if item_type in ["解答题", "实验题"]: # 带选项的大题不拆分
  145. if len(stems_list) > 1:
  146. stems_str = [i["stem"] + "<br/>" + "<br/>".join(i["options"])
  147. if "options" in i else i["stem"] for i in blocks["stems"]]
  148. stems_str = com_stem + "【设问】" + "【<br/>】".join(stems_str)
  149. options = [[]] * len(stems_list)
  150. else:
  151. stems_str = com_stem + "【设问】" + blocks["stems"][0]["stem"] + "<br/>" + "<br/>".join(
  152. blocks["stems"][0]["options"])
  153. options = [[]] * len(stems_list)
  154. if not stems_str.strip(): # 无题干的不入库
  155. return "题干空"
  156. solutions = "【<br/>】".join(blocks["solutions"]) if "solutions" in blocks else "" # 解析
  157. # explanations = "<br/>".join(blocks["explanations"]) if "explanations" in blocks else "" # 分析
  158. if "solutions" in blocks and "answers" in blocks:
  159. have_sol_list = [1 if i else 0 for i in blocks["solutions"]] # 是否有解析标志
  160. # 答案
  161. answers = []
  162. # errmsgs = ""
  163. have_err_num = 0
  164. for k, ans in enumerate(blocks["answers"]):
  165. new_ans = ans
  166. if type(ans) == list:
  167. if len("".join(ans)) <= 8:
  168. new_ans = "、".join(ans)
  169. else:
  170. new_ans = ";".join(ans)
  171. if not new_ans and have_sol_list[k]: # 默认答案个数和解析个数相等
  172. new_ans = "见解析"
  173. elif not new_ans and not have_sol_list[k]:
  174. # errmsgs = "本题缺少部分答案和解析"
  175. have_err_num += 1
  176. elif len(have_sol_list) == 1 and new_ans and not have_sol_list[0]: # 无解析, new_ans条件可不要
  177. solutions = "略"
  178. # errmsgs = "本题缺少解析"
  179. answers.append(new_ans)
  180. else:
  181. answers = []
  182. if "answers" in blocks:
  183. try:
  184. answers = sum(blocks["answers"], [])
  185. except:
  186. answers = blocks["answers"]
  187. explanations = "<br/>".join(blocks["explanations"]) if "explanations" in blocks else "" # 分析
  188. options = ["【@1@】".join(op) for op in options]
  189. all_items = "【@@】".join([stems_str, "【@2@】".join(options), "【!】".join(answers), solutions, explanations])
  190. # ---------图片处理和公式处理---------------------------
  191. all_items = img_info_sub(all_items, period, sj, item_id) # 图片处理 # 出现图片错误,下一个
  192. if all_items == "失败":
  193. with open(r"F:\zwj\WL\structured_item_ruku\to_zyk\logs\junior_{}_src_fail_id_10-22.txt".format(sb_name[sj]), 'a+', encoding='utf-8') as f1:
  194. f1.write('{}\n'.format(item_id))
  195. f1.close()
  196. with open(r"F:\zwj\WL\structured_item_ruku\to_zyk\logs\junior_{}_all_id_10-22.txt".format(sb_name[sj]), 'a+',
  197. encoding='utf-8') as f2:
  198. f2.write('{}\n'.format(item_id))
  199. f2.close()
  200. if __name__ == '__main__':
  201. import re
  202. from pprint import pprint
  203. # tongji_paperid_of3("英语", "初中")
  204. tongji_itemid_han_img("化学", "初中")
  205. # ------------------------------------------------------------------------------------
  206. # df1 = pd.read_excel(r"F:\zwj\WL\structured_item_ruku\files\hfs\paperid_math_junior.xlsx")
  207. # df1 = df1[df1["grade"] == "中考专题"]
  208. # paper_ids = df1["paper_id"].tolist()
  209. # # --------------------------------------------------------------------------------------
  210. # rest_ids = []
  211. # res = json.loads(open(r"F:\zwj\WL\structured_item_ruku\files\hfs\junior_math.json").read())
  212. # for id in paper_ids:
  213. # if id not in res['ids']:
  214. # rest_ids.append(id)
  215. # # print(rest_ids)
  216. # res_ids = {'ids': rest_ids}
  217. # re_f = open(r"F:\zwj\WL\structured_item_ruku\files\hfs\paperid_math_中考_junior.json", 'w', encoding='utf-8')
  218. # json.dump(res_ids, re_f)
  219. # f_lists = open(r"F:\zwj\WL\structured_item_ruku\files\hfs\all_citys.txt",'r', encoding="utf-8").readlines()
  220. # city_dd = {}
  221. # for i, v in enumerate(f_lists):
  222. # if i%2 == 1:
  223. # # print(v)
  224. # city_dd[f_lists[i-1].strip()] = re.split("\s+", v.strip())
  225. # pprint(city_dd)
  226. # provinces = sum([[k] * len(v2) for k, v2 in city_dd.items()], [])
  227. # citys = []
  228. # print(provinces, len(provinces))
  229. # print(sum(city_dd.values(), []), len(sum(city_dd.values(),[])))
  230. # res = pd.DataFrame({"省份、直辖市、自治区": provinces, "市": sum(city_dd.values(), [])})
  231. # res.to_excel(r"F:\zwj\WL\structured_item_ruku\files\hfs\资源获取统计.xlsx",
  232. # index=False)