#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re

import numpy as np
from PIL import Image

import configs
from utils.washutil import table_label_cleal
  8. def option2block(option_con, item_no_type):
  9. """
  10. 选择题选项切分
  11. 对于选项切分部分,最好也像题号一样先自我切分纠错,但这样老师如果手误打错了字母,可能就解析出错!!!!!
  12. :return:
  13. """
  14. def del_table(ss):
  15. ss = re.sub(r"</?t[dr]>|</?tbody>|</?table>|</?div>|</?p>", "", ss.replace("<td><p>", " "))
  16. return ss
  17. # print('***********',option_con)
  18. if '<table><tbody><tr>' in option_con and \
  19. len(re.findall('<tr><td><p>(A\s*[..、、::].+?|\(A\)\s*[..、、]?.+?)</tr>', option_con.strip())) == 1:
  20. st_opt = re.search('<table><tbody><tr><td><p>(A\s*[..、、::].+?|\(A\)\s*[..、、]?.+?)</tr>',
  21. option_con.strip()).start()
  22. option_con = option_con.strip()[0:st_opt] + '\n' + del_table(option_con.strip()[st_opt:])
  23. # print("option_con:", option_con)
  24. option_con = re.sub(r"</table>\n*\s*(<p>)?\s*(A\s*[、、..::]|\(A\)\s*[、、..]?)(.+?)", r"</table>【【A、】】\3",
  25. option_con, flags=re.S)
  26. if re.search("\n\s*C", option_con) is None and re.search("\n\s*c", option_con):
  27. option_con = re.sub("\n\s*c", "\nC", option_con)
  28. # option_con = re.sub(r"(\n\s*(<img\s*src=\".*?\"\s(width|height|eq-code|data-latex|ocr-latex)=.*?[\"/]>\s*)+?\s*)(A[、、..::].+?)", r"\1\n\3", option_con.strip())
  29. option_con = re.sub(r"(\n\s*(<img\s*src=((?![/>]).)*?/>\s*)+?\s*)(A[、、..::].+?)", r"\1\n\4", option_con.strip())
  30. con = re.sub(r"\n\s*([A-H])\s*[、、..::](.+?)", r"\n【【\1、】】\2", option_con.strip()) # 行首的A、不能考虑,故得用strip
  31. if item_no_type == 1 and len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 2 and \
  32. len(re.findall(r'\([A-H]\)', con)) > 2: # 针对题干是第一种类型,选项是第二种类型的情况
  33. item_no_type = 2
  34. if item_no_type == 2:
  35. con = re.sub(r"\n\s*\(([A-Hc])\)\s*[、、..]?(.+?)", r"\n【【\1、】】\2", option_con)
  36. # print(11111,option_con)
  37. if item_no_type == 1:
  38. if len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 3:
  39. while re.search(r"\n\s*[A-H]\s*<img\s*src=.+?", con.replace(" ", "")): # 2020/7/15
  40. con = re.sub(r"\n\s*([A-H])\s*(<img\s*src=.+?)", "\n" + r"【【\1、】】\2", con)
  41. while re.search(r"(\n\s*<img\s*src=.+?)([A-H][..、、])(.+?)", con.replace(" ", "")):
  42. con = re.sub(r"(\n\s*<img\s*src=.+?)(?<!【)([A-H]\s*[..、、::])(.+?)", r"\1" + "\n" + r"【【\2】】\3", con)
  43. while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-Hc][..、、])\n+(.+?)(?<!【)([A-H][..、、])(.+?)",
  44. con.replace(" ", ""), re.S):
  45. con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】.+?)(?<!【)([A-H]\s*[..、、::])\s*\n+(.+?)"
  46. r"(?<!【)([A-H]\s*[..、、::])(.+?)", r"\1【【\2】】\3【【\4】】\5", con, flags=re.S)
  47. while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-H][..、、])\n+(.+?)", con.replace(" ", ""), re.S):
  48. con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】.+?)(?<!【)([A-H]\s*[..、、::])\s*\n+(.+?)",
  49. r"\1【【\2】】\3", con, flags=re.S)
  50. while re.search(r"(\n【【[A-H][..、、]】】.+?)(?<!【)([A-H][..、、])(.+?)", con.replace(" ", "")):
  51. con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】.+?)(?<!【)([A-H]\s*[..、、::])(.+?)", r"\1【【\2】】\3", con)
  52. while re.search(r"(\n【【[A-H][..、、]】】[^【]+?/>\s+)(?<!【)([B-H][..、、])(.+?)", con.replace(" ", ""), re.S):
  53. con = re.sub(r"(\n\s*【【[A-H]\s*[..、、]】】[^【]+?/>\s+)(?<!【)([B-H]\s*[..、、::])\s*(.+?)",
  54. r"\1【【\2】】\3", con, flags=re.S) # 选项子母前面是图片 9/8
  55. if item_no_type == 2:
  56. if len(re.findall(r'【【[A-H][..、、]】】', con)) <= 3:
  57. while re.search(r"\n\s*\([A-H]\)\s*<imgsrc=.+?", con.replace(" ", "")): # 2020/7/15
  58. con = re.sub(r"\n\s*\(([A-H])\)\s*(<img src=.+?)", "\n" + r"【【\1、】】\2", con)
  59. while re.search(r"(\n\s*<imgsrc=.+?)(\([A-H]\)[..、、]?)(.+?)", con.replace(" ", "")):
  60. con = re.sub(r"(\n\s*<img src=.+?)\(([A-H])\)\s*[..、、]?(.+?)", r"\1" + "\n" + r"【【\2、】】\3", con)
  61. while re.search(r"(\n【【[A-H]、】】.+?)\(([A-H])\)[..、、]?\n+(.+?)\(([A-H])\)[..、、]?(.+?)",
  62. con.replace(" ", ""), re.S):
  63. con = re.sub(r"(\n\s*【【[A-H]、】】.+?)\(([A-H])\)\s*[..、、]?\s*\n+(.+?)"
  64. r"\(([A-H])\)\s*[..、、]?(.+?)", r"\1【【\2、】】\3【【\4、】】\5", con, flags=re.S)
  65. while re.search(r"(\n【【[A-H]、】】.+?)\(([A-H])\)[..、、]?\n+(.+?)", con.replace(" ", ""),re.S):
  66. con = re.sub(r"(\n\s*【【[A-H]、】】.+?)\(([A-H])\)\s*[..、、]?\s*\n+(.+?)",
  67. r"\1【【\2、】】\3", con, flags=re.S)
  68. while re.search(r"(\n【【[A-H]、】】.+?)\(([A-H])\)[..、、]?(.+?)", con.replace(" ", "")):
  69. con = re.sub(r"(\n\s*【【[A-H]、】】.+?)\(([A-H])\)\s*[..、、]?(.+?)", r"\1【【\2、】】\3", con)
  70. con_list = re.split(r"【【[A-H]\s*[..、、]】】", con)
  71. if len(con_list) > 1:
  72. stem_opt = table_label_cleal(con_list[0])
  73. con_list = list(map(del_table, con_list[1:]))
  74. con_list.insert(0, stem_opt) # 题干中的表格不需要清洗
  75. return con_list, con
  76. recur_n = 1 # 递归次数
  77. def option_structure(one_item, con, ans, item_no_type, is_danti=0, is_slave=0):
  78. """
  79. 选择题选项拆分结构化
  80. 还需要判断一下 选项个数与题型的对应!!!!
  81. :return:
  82. """
  83. global recur_n
  84. # print(con)
  85. # print('----------------------')
  86. if recur_n>2:
  87. if 'options' not in one_item and not is_slave:
  88. one_item["errmsgs"].append("选项格式不正确")
  89. recur_n = 1
  90. return one_item
  91. ans = re.sub("[;;.]+", "", ans)
  92. ans2 = []
  93. for a in ans.split("#"):
  94. if 0<len(a.replace(" ", "")) < 8:
  95. ans2.append("、".join(re.findall(r"[A-G]", a)))
  96. one_item["key"] = "; ".join(ans2)
  97. options_rank = get_options_arrange(one_item["stem"])
  98. # print("id:", one_item['item_id'])
  99. # print("options_rank:",options_rank)
  100. con_list, repl_con = option2block(con, item_no_type)
  101. # print(len(con_list), con_list)
  102. # 初筛
  103. if len(con_list) < 5:
  104. opt_letter = re.findall(r"【【([A-H])\s*[..、、]】】", repl_con)
  105. if opt_letter and opt_letter[0] == 'B' and re.search("<img src=.+?/>\s*A\s*[..、、].+?$", con_list[0]):
  106. re_split = re.sub("(<img src=.+?/>)\s*A\s*[..、、](.+?)$", r"\1【【A、】】\2", con_list[0])
  107. con_list[0] = re_split.split("【【A、】】")[0]
  108. con_list.insert(1, re_split.split("【【A、】】")[1])
  109. if len(con_list) >= 5:
  110. pattern_1 = re.compile(r"\s([1-9]|1[0-9])[..、、].+?([是为有]|等于)[((]\s*[))]\n", re.S)
  111. pattern_2 = re.compile(r"\s\(([1-9]|1[0-9])\).+?([是为有]|等于)[((]\s*[))]\n", re.S)
  112. pattern_3 = re.compile(r"([是为有]|等于)[((]\s*[))]\n", re.S)
  113. # 第一个错误针对题目中没有答案解析的情况,不然就是选项切分错误
  114. if not is_danti:
  115. if (item_no_type == 1 and any([True for op in con_list[1:] if re.search(pattern_1, op)])) or \
  116. (item_no_type == 2 and any([True for op in con_list[1:] if re.search(pattern_2, op)])):
  117. one_item["errmsgs"].append("本题选项与下一题题干间没有换行符,请注意重新换行!!!") # 一般只有一题和上一题连在一起
  118. if 'item_id' in one_item:
  119. one_item['spliterr_point'] = one_item['item_id']
  120. return one_item
  121. elif any([True for op in con_list[1:] if re.search(pattern_3, op)]):
  122. one_item["errmsgs"].append("本题的下一题的题号有问题,请注意重新输入!!!")
  123. if 'item_id' in one_item:
  124. one_item['spliterr_point'] = one_item['item_id']
  125. # ------------------------------------------------------------------------
  126. aft_opt = [] # 针对选项后是题目图片的情况
  127. if "\n" in con_list[-1]:
  128. ccon = re.split("\n+", con_list[-1])
  129. while re.match("<img src=", ccon[-1]) and len(ccon) > 1:
  130. aft_opt.insert(0, ccon[-1])
  131. ccon = ccon[:-1]
  132. if aft_opt:
  133. con_list[0] += "\n" + "\n".join(aft_opt)
  134. con_list[-1] = "\n".join(ccon)
  135. # -------------------------------------------------------------------------
  136. # 选项纠错
  137. con_list[0] = re.sub(r"\(\d+分\)", "", con_list[0][:9]) + con_list[0][9:]
  138. opt_letter = re.findall(r"【【([A-H])\s*[..、、]】】", repl_con)
  139. # print('/////////////////////////',opt_letter)
  140. if "".join(sorted(opt_letter)) in "ABCDEFGHIJ" or "".join(sorted(opt_letter)) in ["ABCE", "ABDE", "ACDE", "BCDE"]:
  141. # con_list = pic_transfer(con_list)
  142. if con_list:
  143. return dict(one_item, **{"stem": con_list[0],
  144. "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list[1:]],
  145. "options_rank": options_rank,
  146. }) # , "options_num": len(con_list[1:])
  147. else:
  148. # 初次选项拆分的错误判断
  149. con_list = option_label_correct(opt_letter, con_list, repl_con)
  150. # double_l = [key for key, value in dict(Counter(opt_letter)).items() if value > 1]
  151. if type(con_list) == str:
  152. one_item["errmsgs"].append(con_list)
  153. return one_item
  154. else:
  155. # con_list = pic_transfer(con_list)
  156. if con_list:
  157. return dict(one_item,
  158. **{"stem": con_list[0],
  159. "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list[1:]],
  160. "options_rank": options_rank,
  161. })
  162. # return dict(one_item, **dict(zip(["stem","A","B","C","D"], con_list)))
  163. else:
  164. # 选项可能放在表格中
  165. is_fail = 0
  166. con_list2 = re.split(r"\n+", con)
  167. errmsgs = ""
  168. if len(con_list2) == 2: # 选项是4个图片组成的情况
  169. option_array = len(re.findall("(^|\n)<img src=.+?", con_list2[1].strip()))
  170. if option_array > 2: # 排列情况
  171. options_rank = 1
  172. elif option_array > 1:
  173. options_rank = 3
  174. else:
  175. options_rank = 2
  176. ims = con_list2[1].split("<img src=")
  177. if len(ims) == 5 and re.search(r"[\u4e00-\u9fa5]", ims[0]) is None:
  178. con_list2 = [con_list2[0] if k == 0 else "<img src=" + v
  179. for k, v in enumerate(con_list2[1].split("<img src="))] # 默认将“<img src=”切分后的第一项丢掉了
  180. # if len(con_list2) == 5:
  181. con_list2[0] = re.sub(r"\(\d+分\)", "", con_list2[0].replace(" ", "")[:9]) + con_list2[0][9:]
  182. return dict(one_item, **{"stem": con_list2[0],
  183. "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list2[1:]],
  184. "options_rank": options_rank,
  185. })
  186. else:
  187. errmsgs = """选项格式不正确,请改为: A.xxxx B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
  188. 【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输;3>>选项图片时用嵌入式;
  189. 4>>选项太长时,每项之间要换行,上一项的内容不要与下一项在同一行!!"""
  190. is_fail = 1
  191. else:
  192. con_list3 = re.split(r"\n(?=<img)", con)
  193. if len(con_list3) == 5:
  194. return dict(one_item, **{"stem": con_list3[0],
  195. "options": [re.sub("(<br/>|\n)\s*$|\s+$", "", i) for i in con_list3[1:]],
  196. "options_rank": options_rank,
  197. })
  198. else:
  199. errmsgs = """选项格式不正确,请改为: A.xxxx B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
  200. 【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输;3>>选项图片时用嵌入式;
  201. 4>>选项太长时,每项之间要换行,上一项的内容不要与下一项在同一行!!"""
  202. is_fail = 1
  203. op_con = re.split("[((]\s*[))]", con)[-1]
  204. stem_con = "".join(re.split("[((]\s*[))]", con)[:-1])+"( )\n"
  205. if is_fail:
  206. if "table" in op_con:
  207. to_clean_con = re.findall('<table>(((?!(</?table>)).)*)</table>', op_con, re.S)
  208. if len(to_clean_con) == 1:
  209. op_con = re.sub("</?table>|</?tr>|</?td>", "", op_con)
  210. one_item = option_structure(one_item, stem_con+op_con, ans, item_no_type)
  211. else:
  212. aa = re.findall("[A-E]", op_con)
  213. if len(aa)==len(set(aa)) == 4:
  214. recur_n += 1
  215. op_con = re.sub("(?<!\\\)([A-E])\s*(?![..、、])", r"\1、", op_con)
  216. one_item = option_structure(one_item, stem_con + op_con, ans, item_no_type, is_slave=is_slave)
  217. if not is_slave and 'options' not in one_item and "选项格式不正确" not in "".join(one_item["errmsgs"]):
  218. one_item["errmsgs"].append(errmsgs)
  219. return one_item
  220. def get_options_arrange(cont):
  221. """
  222. 判断word中选项每行排版个数
  223. :return:
  224. """
  225. options_rank = 1 # 纵向排列
  226. option_num = 0
  227. if '<table><tbody><tr>' in cont:
  228. table_op = re.findall('<tr>.+?>([A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?)</tr>', cont.strip())
  229. if table_op:
  230. option_num = len(re.findall('[A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?', table_op[0]))
  231. if option_num == 2:
  232. options_rank = 3
  233. if option_num > 2:
  234. options_rank = 2
  235. else:
  236. option_list = cont.split("\n")
  237. for op in option_list:
  238. if re.search("^\s*[A-H]\s*[..、、].+?|^\s*[A-H]\s*<img src=.+?|^\s*\([A-Z]\)\s*[..、、]?.+?", op.strip()):
  239. option_num += 1
  240. if option_num == 2:
  241. options_rank = 3 # 一排2个
  242. elif option_num < 2:
  243. options_rank = 2 # 横向排列
  244. return options_rank
  245. def new_options_rank(options):
  246. """
  247. 按提分宝产品B5纸176*250mm设置选项的排版形式
  248. 选项的排版形式暂设置3种:1:纵向排列 2:横向排列 3:一排2个
  249. 中文字符按5号字体,即10.5磅,英文字符按3/4个中文字符算
  250. :return:
  251. """
  252. options_rank = 1 # 纵向排列
  253. option_len = []
  254. for opt in options:
  255. if re.search("\$.*?\$", opt):
  256. return 0
  257. pic_len = []
  258. if "<img " in opt:
  259. for img in re.findall("<img src=.*?/>", opt):
  260. w_info1 = re.search(' style=".*?width: (\d+[.\d]*?)\s*([pxtin]*?);.*?"', img)
  261. w_info2 = re.search(' width="(\d+[.\d]*?)\s*([pxt]*?)"', img)
  262. if w_info1:
  263. if w_info1.group(2) == 'pt':
  264. pic_len.append((25.4/72)*float(w_info1.group(1)))
  265. elif w_info1.group(2) == 'px':
  266. pic_len.append((25.4 / 72) * (3/4) * float(w_info1.group(1)))
  267. elif w_info1.group(2) == 'in':
  268. pic_len.append(25.4 * float(w_info1.group(1)))
  269. elif w_info2:
  270. pic_len.append((25.4 / 72) * (3 / 4) * float(w_info2.group(1)))
  271. else:
  272. print("选项中存在图片宽高未知")
  273. # 主要没有宽高的图片是用户在编辑器新粘贴的图片,保存在本地,通过读取获取宽高
  274. w_info3 = re.search('<img src=.*?(/[^/]*?/new_image.*?)"', img)
  275. if w_info3:
  276. local_p = configs.IMG_FOLDER + w_info3.group(1)
  277. if os.path.exists(local_p):
  278. w = Image.open(local_p).size[0]
  279. pic_len.append((25.4 / 72) * (3 / 4) * float(w))
  280. else:
  281. print("选项中存在d的宽高未知图片不存在本地")
  282. options_rank = 0
  283. else:
  284. options_rank = 0
  285. opt = opt.replace(img, "")
  286. # 统计字符长度
  287. char_en_l = len(re.findall(r"[a-z\d,.!?;'\-/:<>=*+$~%()\[\]{}\" ]", opt))
  288. opt = re.sub(r"[a-z\d,.!?;'\-/:<>=*+$~%()\[\]{}\" ]", "", opt)
  289. char_zh_l = len(opt)
  290. char_len = (10.5/72)*25.4*(char_en_l*0.75+char_zh_l)
  291. option_len.append(sum(pic_len) + char_len)
  292. # 以最长的选项长度作为参考:<=6个中文字符则排成1行,<=15个中文字符则排成2排,否则都是纵向排列===>此逻辑不对
  293. if sum(option_len) + (len(options)*2 + (len(options)-1)*4)*(10.5/72)*25.4 < 176-40:
  294. options_rank = 2
  295. else:
  296. option_len = sorted(option_len, reverse=True)
  297. if option_len[0]+option_len[1] + (2*2+1*4)*(10.5/72)*25.4 <= 176-40:
  298. options_rank = 3
  299. return options_rank
  300. def option_label_correct(opt_letter, con_list, con):
  301. """
  302. 选项少切了会报错,所以优先解决多切的错误问题
  303. 纠正中标签错误的情况:选项字母不连续或重复;
  304. opt_letter:选项的字母 con_list:选择题拆分了选项的列表
  305. """
  306. lable_sign = re.findall(r"【【([A-H][..、、])】】", con.replace(" ", ""))
  307. con_list2 = con_list.copy()
  308. for i, j in enumerate(lable_sign): # 将con_list的选项字母加上
  309. con_list[i + 1] = j + con_list[i + 1]
  310. # con_list2 = re.split(r"【【[A-H]\s*[..、、]】】", con)
  311. p1 = 0 # 选项在con_list中的起始位置
  312. for k, v in enumerate(con_list[1:]):
  313. if re.search(r"[((]\s*[))]", v): # 选择题末尾一般都有()
  314. opt_letter[k] = '0'
  315. p1 = k + 2
  316. if p1 and p1 < len(con_list[1:]): # '0'不在最后一个位置
  317. option_list = con_list2[p1:]
  318. if len(option_list) >= 4:
  319. new_con_list = ["".join(con_list[:p1])].extend(option_list)
  320. return new_con_list
  321. else: # 只考虑ABCD和ACBD两种情况
  322. label_str = "".join(opt_letter)
  323. if re.match("A","".join(opt_letter)) is None:
  324. label_str = re.sub("[^A]A", "0A", "".join(opt_letter), count=1)
  325. # print(label_str)
  326. # -------------------------------------------------------------
  327. # 若选择题中没有(),题干中还是出现了AA的话,需要判断下是否存在错误
  328. if re.search("AA", label_str):
  329. label_bcd_idx = [k for k, i in enumerate(label_str) if i != 'A']
  330. label_a_idx = [k for k, i in enumerate(label_str) if i == 'A']
  331. length_all = []
  332. for i1 in label_bcd_idx: # 先将公式替换,作选项长度判断
  333. l1 = len(re.sub(r"<img\s*src\s*=\s*((?!/>).)+?/>", "<img>", con_list2[i1+1]).replace(" ",""))
  334. length_all.append(l1)
  335. aver_length = np.mean(length_all)
  336. st_a = label_str.index("AA")
  337. for i2 in label_a_idx:
  338. l2 = len(re.sub(r"<img\s*src\s*=\s*((?!/>).)+?/>", "<img>", con_list2[i2+1]).replace(" ",""))
  339. if abs(l2 - aver_length) >= 12:
  340. if i2 >= st_a:
  341. st_a = i2+1
  342. if st_a < len(label_str)-3:
  343. label_str = "".join(["0" if k < st_a else i for k, i in enumerate(label_str)])
  344. # -----------------------------------------------------------------
  345. label_str = re.sub("A[^BC]", "AA", label_str)
  346. label_str = re.sub("B[^CD]", "BB", label_str)
  347. label_str = re.sub("C[^BD]", "CC", label_str)
  348. label_str = re.sub("D[^E]", "DD", label_str)
  349. # 统计是否有重复的字符,若有,则进行合并,否则保持原来
  350. new_con_list = [con_list[0]]
  351. local_w = 0
  352. while local_w < len(label_str):
  353. if local_w == len(label_str) - 1 and label_str[local_w] == '0':
  354. break
  355. while label_str[local_w] == '0': # 如果‘0’在中间,则‘0’会被去除
  356. local_w += 1
  357. double_num = label_str.count(label_str[local_w])
  358. if double_num >= 2:
  359. new_con_list.append(con_list2[local_w + 1] + "".join(con_list[2 + local_w:local_w+double_num + 1]))
  360. else:
  361. new_con_list.append(con_list2[local_w + 1])
  362. local_w += double_num
  363. new_opt_letter = label_str.replace('AA',"A").replace('BB',"B").replace('CC',"C").replace('DD',"D")
  364. if len(new_con_list) >= 4:
  365. if "".join(sorted(new_opt_letter)) in "ABCDEFGHIJ" or "".join(sorted(new_opt_letter)) in ["ABCE", "ABDE", "ACDE", "BCDE"]:
  366. return new_con_list
  367. return "选项格式不正确,1、请改为: A.xxxx B.xxx,手动输入选项字母及后面的标点符号;" \
  368. "2.第一个选项A与题干之间要换行,各选项按ABCD排序;3.选项含图片时用嵌入式;"
  369. def table_option_struc(stem):
  370. """
  371. 表格类的选项结构化,在化学科目的选择题中较常见
  372. :return: 表格仍然作为表格,选项则根据表格中的选项补充,如A、A B、B
  373. """
  374. options = []
  375. may_options = re.findall("<table>(((?!(</?table>)).)*)</table>", stem)
  376. if may_options:
  377. options_data = may_options[-1][0]
  378. data_col = re.findall("<tr><td>(.*?)</td>", options_data) # 第一列
  379. if re.search("#?A#B#C#D#", re.sub("[..、、,,\s]", "", "#".join(data_col).strip())+"#"):
  380. options_str = re.sub("[..、、,,\s]", "", "#".join(data_col).strip()+"#")
  381. if "A#B#C#D#E#F#" not in options_str:
  382. if "A#B#C#D#E#" in options_str:
  383. options = ["A", "B", "C", "D", "E"]
  384. elif "A#B#C#D#" in options_str:
  385. options = ["A", "B", "C", "D"]
  386. else:
  387. data_rows = re.findall("<tr>(.*?)</tr>", options_data)
  388. data_row = re.findall("<td>(.*?)</td>", data_rows[0]) # 第一行
  389. if re.search("#?A#B#C#D#", re.sub("[..、、,,\s]", "", "#".join(data_row).strip()) + "#"):
  390. options_str = re.sub("[..、、,,\s]", "", "#".join(data_row).strip() + "#")
  391. if "A#B#C#D#E#F#" not in options_str:
  392. if "A#B#C#D#E#" in options_str:
  393. options = ["A", "B", "C", "D", "E"]
  394. elif "A#B#C#D#" in options_str:
  395. options = ["A", "B", "C", "D"]
  396. return options
  397. if __name__ == '__main__':
  398. stem ="""
  399. 下列物质与危险化学品标志的对应关系不正确的是<br/><table><tr><td>A</td><td>B</td><td>C</td><td>D</td></tr><tr><td>汽油</td><td>天然气</td><td>浓硫酸</td><td>氢氧化钠</td></tr><tr><td><img src="files/image2.png" width="125px" height="116px" /></td><td><img src="files/image3.png" width="117px" height="117px" /></td><td><img src="files/image4.png" width="118px" height="119px" /></td><td><img src="files/image5.png" width="122px" height="118px" /></td></tr></table>
  400. """
  401. print(table_option_struc(stem))