123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506 |
- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
- # 本文件包含以下函数
- # stem_ans_split:将切出来的一道题 按答案解析 进一步细分
- # correct_wrong_no :针对分错的题号进行 纠正 或 报错
- # stems_structure_byno:按题号进行切分;
- # dati2slave :带小问的大题 按小问切分
- # split2little_con: 将带小问的填空题或解答题 按 小问 继续划分,小问已切分好
- # get_options_arrange: 判断word中选项每行排版个数
- import re
- from washutil import table_label_cleal
- from ans_structrue import only_parse_split, get_ans_from_parse
- from pprint import pprint
- from collections import Counter
def stem_ans_split(one_item_dict, case):
    """
    Split one already-isolated question into stem, answer and analysis.

    The raw text is first passed through ``table_label_cleal``.  Two layouts
    are handled:

    * no answer keyword (``case == 'case0'``, or any other case whose text
      turns out to contain no "【答案】" marker): the text is split on the
      analysis headings; everything before the first heading is the stem and
      the rest is the analysis, the answer starts out empty;
    * answer keyword present: the text is split once on "【答案】", and the
      part after it is split once more on the first analysis heading
      (the answer is assumed to precede the analysis).

    A leading "【解析】" heading is removed from the analysis, one item number
    ("12." / "3、" ...) found within the first five characters of the stem is
    removed, and when no answer was found it is derived from the analysis via
    ``get_ans_from_parse``.

    :param one_item_dict: preliminary structure of a single question; the keys
        "content" and "item_topic_name" are read here
    :param case: layout selector, 'case0' means "no answer keyword"
    :return: {"content": ..., "answer": ..., "parse": ...}
    """
    # Headings that are re-wrapped in 【】 when they lead the analysis text.
    parse_titles = ('解答', '分析', '解析', '详解', '点评', '点睛')

    item_type = one_item_dict["item_topic_name"]
    cleaned = table_label_cleal(one_item_dict["content"])

    if case == 'case0' or re.search(r"【答案】", cleaned) is None:
        # No answer keyword.  Also used as a fallback when the caller expected
        # "【答案】" but the text does not contain it (previously a KeyError).
        pieces = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?", cleaned)
        # re.split with one capture group never yields None, so pieces are
        # only stripped; the literal word "None" inside a stem is kept.
        pieces = ['【' + a + '】' if str(a).strip() in parse_titles
                  else ("" if a is None else str(a).strip())
                  for a in pieces]
        dd = {"content": pieces[0], "answer": ""}
        if len(pieces) > 1:
            parse = pieces[1] + "\n".join(pieces[2:]).replace("\n\n", "\n")
            dd["parse"] = re.sub(r"^\s*【解析】", "", parse)
        else:
            # No analysis heading at all: the whole text is the stem.
            dd["parse"] = ""
    else:
        # Answer keyword present; "【答案】" is known to occur, so the split
        # below always produces exactly two parts.
        content, answer_part = re.split(r"【答案】\n?", cleaned, maxsplit=1)
        sub = re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", answer_part, maxsplit=1)
        dd = {"content": content, "answer": sub[0]}
        if len(sub) == 3:
            # sub == [answer, heading, analysis text]
            dd["parse"] = re.sub(r"^\s*【解析】", "", "【" + sub[1] + "】" + sub[2])
        else:
            dd["parse"] = ""

    # Drop the item number at the head of the stem.  count=1 keeps a decimal
    # that directly follows the number ("1.2.5..." -> "2.5...") intact.
    dd["content"] = re.sub(r"[1-9][0-9]?\s*[..、、]", "", dd["content"][:5], count=1) \
        + dd["content"][5:]

    # No explicit answer: derive it from the analysis text.
    if not dd["answer"]:
        dd["answer"] = get_ans_from_parse(dd["parse"], item_type, dd["content"])
    return dd
def stem_ans_split2(one_type_list, idx1, idx2, item_type, case):
    """
    Split one question, given as a slice of lines, into stem, answer and analysis.

    The lines ``one_type_list[idx1:idx2]`` (or ``one_type_list[idx1:]`` when
    ``idx2 == -1``) are joined with newlines and passed through
    ``table_label_cleal``.  Two layouts are handled:

    * no answer keyword (``case == 'case1'``, or any other case whose text
      contains neither "【答案】" nor "答案:"): the text is split on the
      analysis headings; the first piece is the stem, the rest the analysis;
    * answer keyword present: the text is split once on the answer marker and
      the remainder once on the first analysis heading, in either the
      bracketed ("【解析】") or the colon ("解析:") form.

    When no explicit answer was found, it is derived from the analysis:
    for choice questions from a "故选 X" phrase (empty if absent), otherwise
    from a "故答案为 ..." phrase, defaulting to "见解析".

    :param one_type_list: list of text lines belonging to one question type
    :param idx1: index of the first line of the question (inclusive)
    :param idx2: index of the first line of the next question, -1 for "to the end"
    :param item_type: question type name
    :param case: layout selector, 'case1' means "no answer keyword"
    :return: {"content": ..., "answer": ..., "parse": ..., "item_topic_name": ...}
    """
    # Headings that are re-wrapped in 【】 when they lead the analysis text.
    parse_titles = ('解答', '分析', '解析', '详解', '点评', '点睛')

    one_item = one_type_list[idx1:] if idx2 == -1 else one_type_list[idx1:idx2]
    cleaned = table_label_cleal("\n".join(one_item))

    answer_split = None
    if case != 'case1':
        answer_split = re.split(r"【答案】\n?|答案\s*[::]\n?", cleaned, maxsplit=1)

    if answer_split is None or len(answer_split) < 2:
        # No answer keyword (requested by the caller, or simply not present).
        pieces = re.split(r"【(解析|解答|分析|详解|点评|点睛|考点|专题)】\n*?", cleaned)
        pieces = ['【' + a + '】' if str(a).strip() in parse_titles
                  else ("" if a is None else str(a).strip())
                  for a in pieces]
        dd = {"content": pieces[0], "answer": ""}
        if len(pieces) > 1:
            dd["parse"] = pieces[1] + "\n".join(pieces[2:]).replace("\n\n", "\n")
        else:
            # No analysis heading at all: the whole text is the stem.
            dd["parse"] = ""
    else:
        content, answer_part = answer_split
        # The pattern has two capture groups (bracketed / colon form), so
        # re.search is used instead of re.split: split would return both
        # groups and shift the analysis text out of its expected position.
        marker = re.search(
            r"【(解析|解答|分析|详解|点评|点睛)】\n?|(解析|解答|分析|详解|点评|点睛)\s*[::]",
            answer_part)
        dd = {"content": content}
        if marker:
            title = marker.group(1) if marker.group(1) is not None else marker.group(2)
            dd["answer"] = answer_part[:marker.start()]
            dd["parse"] = re.sub(r"^\s*【解析】", "",
                                 "【" + title + "】" + answer_part[marker.end():])
        else:
            dd["answer"] = answer_part
            dd["parse"] = ""

    # Drop the item number at the head of the stem.  count=1 keeps a decimal
    # that directly follows the number ("1.2.5..." -> "2.5...") intact.
    dd["content"] = re.sub(r"[1-9][0-9]?\s*[..、、]", "", dd["content"][:5], count=1) \
        + dd["content"][5:]
    dd["item_topic_name"] = item_type if re.sub('[((]', "", item_type) != '本大题' else "解答题"

    # Only fall back to the analysis text when no explicit answer was given,
    # mirroring stem_ans_split.
    if not dd["answer"]:
        if item_type in ["单选题", "多选题", "选择题"]:
            ans = re.search(r'故选[::]?<imgsrc\d+data-latex="([A-Z;;和与、、\s]+)"/>|故选[::]?([A-Z;;和与、、\s]+)',
                            dd["parse"].replace("$", "").replace(" ", ""))
            if ans:
                dd["answer"] = ans.group(1) if ans.group(1) is not None else ans.group(2)
            else:
                dd["answer"] = ""
        else:
            dd["answer"] = "见解析"
            ans = re.search(r'故\s*[::]?\s*答案分?别?[为是]?\s*[::]?\s*(.+?)[..]\s*\n', dd["parse"])
            if ans:
                dd["answer"] = ans.group(1)
    return dd
- # def correct_wrong_no(con_list, items_no, item_no_type):
- # """
- # 针对分错的题号进行纠正 ;;带解析的划分题目最好按关键字拆分!!!!
- # 题号划分错误有:题号重复,题号遗漏,题号偏离很远的错误如88.等
- # 无题型行时,con_list中每个元素代表每一行
- # 有题型行时,con_list中每个元素代表每个题型中的所有题目
- # items_no:初步找到的所有题号
- # :return: con_list
- # """
- # # items_no = [1,2,3,4,5,6,7, 8, 9, 10, 11, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
- # seq_no = find_seq_num(items_no) # 找到连续的分组
- # print("items_no:", items_no)
- # print("seq_no:", seq_no)
- #
- # err_no_idx = {} # 分错的分组序号和错误题号,主要针对2个以内成组的序号
- # double_no = [] # 针对2个以上成组,且重复序号 分错的情况
- # omit_no = [] # 因没有换行或无题号导致 没有 切分出来的题号
- # right_no_list = []
- # if len(seq_no) > 1: # 存在分断或分错的地方
- # print('按题号切分的过程中,存在分断或分错的地方')
- # right_no = [i for i in seq_no if len(i) > 2]
- # if len(find_seq_num(sum(right_no, []))) == 1: # 2个以上成的所有组是连续的
- # # 题号序列异常值判断
- # right_seq = del_exception_value(items_no) # 主要去掉异常的大值
- # # print("right_seq:",right_seq)
- # right_max_v = -1
- # if not right_seq:
- # right_max_v = max(items_no)
- # else:
- # right_max_v = right_seq[-1]
- # # print("right_max_v:", right_max_v)
- # if sum(right_no, [])[0] == 1 and sum(right_no, [])[-1] == right_max_v: # 题号从1开始
- # # [1,2,3,4,5,6,7, 8, 9, 10, 11, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
- # right_no_list.extend([i for k, i in enumerate(seq_no) if len(i) > 2])
- # err_no_idx.update({k: i for k, i in enumerate(seq_no) if len(i) <= 2}) # 出现重复题号
- # else: # 说明左右两边有遗漏
- # # [[1, 2], [4, 5], [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]]
- # # [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], [20, 21]]
- # # [[1, 2], [4, 5], [7, 8, 9, 10, 11], [6], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]]
- # # todo_no = [i for i in seq_no if len(i) <= 2]
- # right_no_idx = [k for k, i in enumerate(seq_no) if len(i) > 2]
- # if seq_no[:right_no_idx[0]]: # k>=1 左边有遗漏
- # que_no = set(range(1, sum(right_no, [])[0])) - set(sum(seq_no[:right_no_idx[0]], []))
- # omit_no.extend(list(que_no))
- # elif len(right_no_idx) == 1 and seq_no[right_no_idx[0]+1:]: # 右边有遗漏
- # que_no = set(range(sum(right_no, [])[-1]+1, right_max_v)) - set(sum(seq_no[right_no_idx[0]+1:], []))
- # omit_no.extend(list(que_no))
- # # print("omit_no:",omit_no)
- # # 既遗漏又有重复的错误不同时考虑!!!!,先报遗漏错误,教师修改后再对重复部分进行纠正
- # else:
- # # 存在题号错误:一种是与正确的重复,另一种是与序号偏离的很远,如81,目前是暂定取99内的数字作为序号
- # # [[1, 2], [4, 5, 6, 7, 8, 9, 10, 11], [13, 14], [16, 17, 18, 19, 20, 21]]
- # # [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [13, 14], [16, 17, 18, 19, 20, 21]]
- # num_count = Counter(items_no)
- # # print("num_count:",num_count)
- # if len(set(num_count.values())) > 1:
- # print("存在{题号重复}的切分错误")
- # for k, v in num_count.items():
- # if v >= 2: # 重复2次以上
- # # print(items_no.index(k)) # 只能获取第一个元素的索引值
- # v2_index = [index for (index, value) in enumerate(items_no) if value == k][1:] # 重复序号的索引
- # # 判断重复序号哪个是错误的,这里没有考虑题号遗漏的情况
- # if v2_index[0]+items_no[0] > k: # 位置 > 序号, 一般要求题号从1开始
- # for subi in v2_index:
- # # print(subi, k)
- # double_no.append((k, 'xiao'))
- # del items_no[subi]
- # if v2_index[0]++items_no[0] < k: # 位置 < 序号
- # for subi in v2_index:
- # double_no.append((k, 'da'))
- # del items_no[subi]
- #
- # else: # 存在题号遗漏
- # print("存在题号遗漏")
- # for k, i in enumerate(right_no):
- # if k == 0:
- # if i[0] == 2:
- # omit_no.append(1)
- # if i[0] > 2:
- # omit_no.append("1~"+str(i[0]-1))
- # if 0 < k < len(right_no):
- # omit_no.extend(list(range(right_no[k-1][-1]+1, i[0])))
- # # if omit_no:
- # # return "第" + ",".join(map(str, omit_no)) + "题的格式是否正确,不要放在表格中,且要求题号从1开始并连续;" \
- # # "若格式正确,请将第" + ",".join(map(str, omit_no)) + "题的题号(包括题号后的标点符号)重新手输且与上一题重新换行"
- #
- # if double_no and len(find_seq_num(items_no)) == 1:
- # # 在分错题号前加标识
- # all_con = "@@\n" + "@@\n".join(con_list)
- # for db in double_no:
- # may_no_st = re.search(r"\n\s*" + str(db[0]) + r'\s*([..、、].+?)',
- # all_con, re.S).start() # 分错位置在全文中的索引
- # if item_no_type == 2:
- # may_no_st = re.search(r"\n\s*[((]\s*" + str(db[0]) + r'\s*[))]\s*([..、、]?.+?)',
- # all_con, re.S).start() # 分错位置在全文中的索引
- # if db[1] == 'xiao': # 重复的切分错误的序号在正确的后面,第一个匹配到的是正确的
- # # all_con = all_con[:may_no_st] + re.sub(r"\s+((?!src).)+?", r"\1", all_con[may_no_st:][:15]) + all_con[may_no_st:][15:]
- # # 该正则表示空格后面是src字符串时,空格保留;最开始时图片已做过替换,这里也可以去掉图片信息中的空格
- #
- # err_no_st = re.search(r"\n\s*" + str(db[0]) + r'\s*([..、、].+?)',
- # all_con[may_no_st+10:], re.S).start() # 分错位置在全文中的索引
- # if item_no_type == 2:
- # err_no_st = re.search(r"\n\s*[((]\s*" + str(db[0]) + r'\s*[))]\s*([..、、]?.+?)',
- # all_con[may_no_st + 10:], re.S).start() # 分错位置在全文中的索引
- # # print("err_no_st:", err_no_st, all_con[may_no_st + err_no_st+10:may_no_st + err_no_st+20])
- #
- # all_con = all_con[:may_no_st + err_no_st + 11] + "【fei】" \
- # + all_con[may_no_st + err_no_st + 11:] # 在分错题号前加标识
- #
- # if db[1] == 'da': # 重复的切分错误的序号在正确的前面,第一个匹配到的是错误的
- # all_con = all_con[:may_no_st + 1] + "【fei】" \
- # + all_con[may_no_st + 1:] # 在分错题号前加标识
- # # print("all_con:",all_con)
- # con_list = all_con.split("@@\n")[1:]
- #
- # # 针对2个以内成组的序号 加错误标识
- # sorted_idx = sorted(err_no_idx.keys(), reverse=False) # 对字典按索引位置排序
- # print("err_no_idx:", err_no_idx, "sorted_idx:", sorted_idx)
- # if err_no_idx:
- # if sorted_idx[0] > 0:
- # all_con = "@@\n" + "@@\n".join(con_list)
- # st_flag = str(seq_no[sorted_idx[0] - 1][-1]) # 分错位置的前一个题号
- # # 分错位置的前一个题号在全文中的索引
- # # if err_no_idx[sorted_idx[0]][0] == int(st_flag):
- # # return st_flag + "题题号出现重复"
- # st_flag_index = re.search(r"\n+\s*" + st_flag + r'\s*([..、、].+?)', all_con, re.S).start()
- # if item_no_type == 2:
- # st_flag_index = re.search(r"\n+\s*[((]\s*" + st_flag + r'\s*[))]\s*([..、、]?.+?)', all_con, re.S).start()
- # for k in sorted_idx: # 遍历键
- # for subk in err_no_idx[k]: # 遍历 键 的值
- # # print('*****************')
- # # print("st_flag:", st_flag, '---subk:', subk)
- # # print("st_flag_index:",st_flag_index)
- # err_no_st = re.search(r"\n\s*" + str(subk) + r'\s*([..、、].+?)',
- # all_con[st_flag_index:], re.S).start() # 分错位置在全文中的索引
- # if item_no_type == 2:
- # err_no_st = re.search(r"\n\s*[((]\s*" + str(subk) + r'\s*[))]\s*([..、、]?.+?)',
- # all_con[st_flag_index:], re.S).start() # 分错位置在全文中的索引
- # all_con = all_con[:st_flag_index + err_no_st + 1] + "【fei】" \
- # + all_con[st_flag_index + err_no_st + 1:] # 在分错题号前加标识
- # con_list = all_con.split("@@\n")[1:]
- # else: # 拿到了前面不是题号的序号 [27, 27, 1, 2, 3, 4, 5, 6, 7]
- # all_con = "@@\n" + "@@\n".join(con_list)
- # if items_no.count(1) == 1:
- # con_1 = re.split(r"@@\n\s*1\s*[..、、]", all_con)[1]
- # con_list = ("1、"+con_1).split("@@\n") # right_no_list = sum(right_no_list, [])
- # # right_no_list = str(right_no_list).replace("[", "").replace("]", "").replace(" ", "").split(",")
- #
- # # con_list = re.split(r"\n\s*("+ r"|".join(right_no_list) + ")\s*[..、、]", all_con)[1:]
- # # if len(con_list) > 1:
- # # con_list = [con for k, con in enumerate(con_list) if k % 2 == 1]
- # return con_list
def split2one_item(con_list):
    """
    Split a teacher-edition paper (first paper format) into single questions.

    This format contains explicit answer and analysis keywords.  The input
    comes from an html file and the document is first divided by major
    question section.

    Splitting idea:
        1. Split on blank lines: first drop the blank paragraphs (<p> </p>)
           that sit directly in front of 【答案】, 【解析】 and image tags such
           as <img src="files/imageN.png">, then split on <p></p>.

    Format requirements: every question starts with a number followed by an
    English full stop (e.g. "21."); every major section starts with a Chinese
    numeral (一二三四) followed by a Chinese enumeration comma.

    :return: None
    """
    # The splitting logic that follows this function in the file is entirely
    # commented out, so the only live behaviour is to return None.
    return None
- # item_no_type = 1
- # # all_con = table_label_cleal("\n" + "\n".join(con_list))
- # # item_no = [int(no) for no in re.findall(r'\n+\s*([1-9][0-9]?)\s*[..、、]', all_con)]
- # # if len(item_no) <= 2:
- # # item_no_type = 2
- # # item_no = [int(no) for no in re.findall(r'\n+\s*[((]\s*([1-9][0-9]?)\s*[))]\s*[..、、]?', all_con)]
- # # if len(item_no) > 3:
- # # all_con = re.sub(r'\n\s*\(([1-9][0-9]?)\)\s*[..、、]?', "\n" + r"【@\1、", all_con)
- # # con_list = all_con.replace("【@", "").split("\n")[1:]
- # # ----------------------------------------------------------------------------
- # # 去掉多余空格,作用不大
- # con2 = ["【delete】" if (k < len(con_list) - 1 and v.strip() == "" and (
- # re.match(r"【(答案|解析)】|(答案|解析)\s*[::]|<imgsrc\d+|\s+", con_list[k + 1].strip()) or
- # re.match(r"(([1-9]|[1-4][0-9])\s*[..、、]|[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)",
- # con_list[k + 1].strip()) is None))
- # or (k > 0 and v.strip() == "" and (
- # re.match(r"【(答案|解析)】$|(答案|解析)\s*[::]", con_list[k - 1].strip()) or
- # re.match(r"[a-z<>/\s]*[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题",
- # con_list[k - 1].strip())))
- # else v for k, v in enumerate(con_list)]
- # con3 = list(filter(lambda x: x != "【delete】", con2))
- # while len(con3) > 0:
- # if con3[-1].strip() == "":
- # del con3[-1]
- # if con3[0].strip() == "":
- # del con3[0]
- # con3.append("") # 不然最后一个题就漏掉了
- #
- # # 开头没用信息处理
- # con3[0] = re.sub(r"([一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)", r"\n\1", con3[0])
- # while con3 and (re.search(r"[\u4e00-\u9fa5]", con3[0]) is None
- # or re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", con3[0]) is None):
- # del con3[0]
- #
- # # ----------------------解析 方案【1】-------------------------------------------------------------
- # # 根据大题型分,再按【答案|解析】初步拆分题目,再在‘解析’和‘答案’间细分‘题干’和‘解析’
- # # 1、获取题型行信息、按题型行切分
- # con4, all_type_info, all_type, each_item_score, each_item_score2, select_type_id, choice_class \
- # = get_item_head_info("\n" + "\n".join(con3))
- #
- # # 2、据是否有题型行分两步进行
- # res = []
- # if not all_type:
- # print("不存在大题题型行或题型行格式有问题")
- # return "不存在大题题型行或题型行格式有问题,请检查" # 放第【2】种方案中进行处理
- # else:
- # if len(all_type) != len(con4):
- # print("存在题型行没有换行")
- # return "存在题型行末尾没有换行,请在所有题型行末尾重新换行" # 放第【2】种方案中进行处理
- # else:
- # # if "非选择题" in all_type:
- # # return "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确"
- # index = 0
- # for num, one_type in enumerate(con4):
- # count = 1
- # if len(re.findall(r"\n\s*【答案】", one_type)) == len(re.findall(r"\n\s*【解析】", one_type)):
- # subcon = re.split(r"((?<=\n)\s*【答案】|(?<=\n)\s*【解析】)\n?", one_type.strip())
- # # index根据第一道题的题号进行纠正
- # st_pat = re.match(r"([1-9]|[1-6][0-9])\s*[..、、].+?", subcon[0].strip())
- # if st_pat and num == 0:
- # st_id = st_pat.group(1)
- # if int(st_id) != 1:
- # index = int(st_id) - 1
- #
- # if len(subcon) == 5: # 只有1道题
- # dd = dict(zip(["content", "answer", "parse"],
- # re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】", table_label_cleal(one_type))))
- # dd["item_topic_name"] = all_type[num]
- # dd["content"] = re.sub(r"\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
- # dd["score"] = each_item_score[num]
- # dd["errmsgs"] = []
- # dd["item_id"] = count + index
- # if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
- # dd["score"] = each_item_score2[str(dd["item_id"])]
- # if select_type_id and dd["item_id"] in select_type_id:
- # dd['is_optional'] = 'true'
- # res.append(dd)
- # # count += 1
- # else:
- # # ------在下一题【解析】在本题【答案】之间找到下一题【content】的位置--------
- # for id in range(len(subcon)):
- # if re.match(r"\n*\s*【解析】", subcon[id]) and id < len(subcon) - 2: # 不是最后一个解析,倒数第二个是最后一个解析
- # count += 1
- # ssub = subcon[id + 1].strip().split("\n") # 首尾空行先去掉
- # blank_line = [i for i, v in enumerate(ssub) if v.strip() == ""] # 空格索引
- # # 索引to题号字典
- # con_id_line_dict = {i: re.match(r"([1-9]|[1-6][0-9])\s*[..、、]", v.strip()).group(1)
- # for i, v in enumerate(ssub)
- # if re.match(r"([1-9]|[1-6][0-9])\s*[..、、]", v.strip())}
- # # print("con_id_line_dict",con_id_line_dict)
- # con_id_line = list(con_id_line_dict.keys()) # 行索引,第几行
- # topicno = list(con_id_line_dict.values()) # 题号序列
- # topicno_line_idx = dict(zip(topicno, con_id_line)) # 题号to行索引字典
- # if len(con_id_line) != len(topicno_line_idx):
- # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
- # index + count) + "题)的题文和上一题的解析之间出现【多个相同的题目序号】,请重新确认!"
- # else:
- # if len(blank_line) == 1 and len(con_id_line) == 1: # 一般情况只有一个空行
- # if con_id_line[0] > blank_line[0]:
- # ssub.insert(con_id_line[0], "【content】")
- # else:
- # if str(count + index) == topicno[0]: # 该题的序号正确,优先按序号拆
- # ssub.insert(con_id_line[0], "【content】")
- # else:
- # ssub[blank_line[0]] = "【content】" # 该题序号不对时再考虑空行
- # elif len(blank_line) != 1:
- # if len(con_id_line) >= 1: # 优先考虑题目序号,多个序号时
- # # ssub.insert(con_id_line[-1], "【content】") # 默认最后一个,很粗糙
- # if str(count + index) in topicno:
- # ssub.insert(topicno_line_idx[str(count + index)], "【content】")
- # else:
- # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
- # index + count) + "题)的题文和上一题的解析之间出现【题目序号不连续】,请检查该题目序号并重新手输!"
- # elif len(blank_line) > 1: # 题目序号有误,多个空行时
- # # ssub[blank_line[-1]] = "【content】"
- # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
- # index + count) + "题)的题文和上一题的解析之间出现【题目序号有误】,请将题目序号重新手输!"
- # else: # 无序号,无空行
- # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
- # index + count) + "题)的题文和上一题的解析之间出现【题目序号或空行都有误】,请将题目序号重新手输并查看空行!"
- # # 如果存在空行有误,且题目序号有误时,那基本就会拆分错误
- # else: # len(con_id_line)!=1
- # if not con_id_line: # 一个空行,没有序号时
- # # ssub[blank_line[0]] = "【content】"
- # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
- # index + count) + "题)的题文和上一题的解析之间出现【题目序号有误】,请将题目序号重新手输!"
- # else: # 1个空行,多个序号时
- # print(all_type[num], "第", count, "道题的题文和上一题的解析之间存在【多个题目序号】")
- # if str(count + index) in topicno:
- # ssub.insert(topicno_line_idx[str(count + index)], "【content】")
- # else:
- # return all_type[num] + "第" + str(count) + "道题(在整篇文档中为第" + str(
- # index + count) + "题)的题文和上一题的解析之间出现【题目序号不连续】,请检查该题目序号并重新手输!"
- # # ssub.insert(con_id_line[-1], "【content】") # 须优化
- # subcon[id + 1] = "\n".join(ssub)
- # # ----------------------------------------------------------------
- # all_item = re.split(r"【content】", "\n".join(subcon).strip())
- # for idk, one_item in enumerate(all_item):
- # dd = dict(zip(["content", "answer", "parse"],
- # re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
- # table_label_cleal(one_item))))
- # dd["item_topic_name"] = all_type[num]
- # dd["content"] = re.sub(r"\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
- # dd["score"] = each_item_score[num]
- # dd["errmsgs"] = []
- # dd["item_id"] = idk + 1 + index
- # if choice_class:
- # for k, v in choice_class.items():
- # if count + index in v:
- # dd["item_topic_name"] = k + "选题"
- # elif len(choice_class) == 1:
- # dd["item_topic_name"] = "多选题" if k == "单" else "单选题"
- # if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
- # dd["score"] = each_item_score2[str(dd["item_id"])]
- # if select_type_id and dd["item_id"] in select_type_id:
- # dd['is_optional'] = 'true'
- # res.append(dd)
- # # pprint(res)
- # # print('------------------')
- # else:
- # # print("【答案】或【解析】格式有误")
- # return "第" + str(num + 1) + "大题《" + all_type[num] + "》中【答案】或【解析】格式有误或其中某道题中出现多个相同关键字或漏关键字"
- # index += count
- # return res, item_no_type
- #
- # def only_parse_split(one_item_ans, item_type, reparse_n = 1):
- # """
- # 拆分出答案和解析
- # :one_item: 一道题的答案解析部分,
- # :return:{'answer': ,"parse": }
- # """
- # dd = {'parse': one_item_ans, 'answer': ""}
- # simp_item = re.sub("(【([解分][析答]|详解|点[评睛])】|答案|解析|详解)\s*[::]?", "", one_item_ans)
- # simp_item = re.sub("[^\u4e00-\u9fa5∵∴]", "", simp_item)
- # if len(simp_item) < 10 and re.search("因为?|因此|所以|根据|依据|若|假设", simp_item) is None:
- # dd['parse'] = ""
- #
- # if re.search(r"【(解析|解答|分析|详解|点评|点睛)】\n?|(解析|解答|分析|详解|点评|点睛)\s*[::]", one_item_ans):
- # dd1 = dict(zip(["answer", "parse_title", "parse"],
- # re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", one_item_ans, maxsplit=1)))
- # dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
- # del dd1["parse_title"]
- #
- # if item_type in ["单选题", "多选题", "选择题", "单项选择", "多项选择"]:
- # ans = re.search(r'故选\s*[::]\s*<img src=[^>]+?data-latex="([A-Z;;和与、、\s]+)".+?/>|故选\s*[::]?\s*([A-Z;;和与、、\s]+)',
- # dd["parse"].replace("$", ""))
- # if ans:
- # dd["answer"] = ans.group(1) if ans.group(1) is not None else ans.group(2)
- # elif not dd['answer']:
- # dd['answer'] = one_item_ans.strip()
- # dd['answer'] = re.sub("[.;;.]", "", dd['answer'])
- # else:
- # ans1 = re.search(r'故\s*[::]?\s*(答案分?别?[为是]?|填)\s*[::]?\s*(.+?)[..]\s*(\n|$)', dd["parse"])
- # ans2 = re.search(r'故\s*[::]?\s*(答案分?别?[为是]?|填)\s*[::]?\s*(<img src=.+?/>)[..]?\s*(\n|$)', dd["parse"])
- # if reparse_n != 2 and "【答案】" not in one_item_ans and \
- # len(re.findall(r"[((]\d[))]|[\n::;;。】]([((](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[))]|[①②③④])",
- # one_item_ans.replace(" ", ""))) > 1:
- # dd["answer"] = "见解析"
- # elif ans1:
- # dd["answer"] = ans1.group(2)
- # elif ans2:
- # dd["answer"] = ans2.group(2)
- # elif not dd['parse']:
- # dd['answer'] = one_item_ans.strip()
- # else:
- # dd["answer"] = "见解析"
- #
- # return dd
|