12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256 |
- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
- """
- """
- import re
- from utils.washutil import table_label_cleal, del_no
- from utils.topic_no import *
def _slice_answers(text, start_idx, ans_item_no_type):
    """Cut ``text`` at every offset in ``start_idx`` and pass each piece through ``del_no``."""
    return [del_no(text[i:j], ans_item_no_type)
            for i, j in zip(start_idx, start_idx[1:] + [None])]


def _locate_item(item_res, item_no, default):
    """
    Return the position in ``item_res`` of the question whose ``item_id`` equals ``item_no``.

    A unique hit is returned directly. With several hits, the last one that has
    no "key" entry yet is returned. ``default`` is returned when nothing qualifies.
    """
    hits = [i for i, v in enumerate(item_res) if v["item_id"] == item_no]
    if len(hits) == 1:
        return hits[0]
    pos = default
    for j in hits:
        # NOTE(review): there is no break, so the LAST question without a key wins;
        # confirm this is intended rather than the first one.
        if "key" not in item_res[j].keys():
            pos = j
    return pos


def anss_structure_with_type(item_res, ans_str, all_type, all_type2,item_type_num,item_type_classify):
    """
    Structure an answer section that contains question-type heading lines.

    The answer text is split at the type headings, each part is cut into
    per-question answers (four numbering strategies are tried in turn until the
    number of answers equals the number of questions of that type), and every
    answer is merged into the matching entry of ``item_res`` through
    ``only_parse_split``.

    NOTE(review): the nesting of this function was rebuilt from a source whose
    indentation had been stripped; please confirm the branch structure.

    :param item_res: list of question dicts; each one is read through
        "item_id" and "stem" and updated in place with the parsed answer.
    :param ans_str: whole answer text.
    :param all_type: question types found in the question part (may be patched in place).
    :param all_type2: question types found in the answer part.
    :param item_type_num: per-section info; index 1 of each entry is used as
        that section's question count.
    :param item_type_classify: mapping type -> number of questions (may be
        updated in place).
    :return: ``(item_res, rd1_is_fail)``; ``rd1_is_fail`` is 1 when the two
        type lists differ in length (nothing is processed then), otherwise 0.
    """
    rd1_is_fail = 0
    is_amount_wrong = 0
    if len(all_type2) != len(all_type):
        rd1_is_fail = 1
        return item_res, rd1_is_fail

    if not all_type[0]:  # the first type heading is missing in the question part
        all_type[0] = all_type2[0]
        if all_type.count(all_type[0]) > 1:
            item_type_classify[all_type[0]] += item_type_num[0][1]
    if sorted(all_type) != sorted(all_type2):
        print("题文和答案中的题型不一致,题文中的题型={},答案中的题型={}".format(str(all_type), str(all_type2)))
    ans22 = re.split(r"\n\s*[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题.*?[分。.]?\s*(?=\n)", ans_str)
    # Discard leading chunks that contain neither A-H letters nor Chinese characters.
    while re.search(r"[A-H\u4e00-\u9fa5]", ans22[0]) is None:
        del ans22[0]
    ans_item_no_type = judge_ans_no_type(ans22, item_type_num)  # first guess of the numbering style

    # Same table pattern that get_table_ans uses; re.S lets it span line breaks.
    table_re = re.compile("<table>(((?!(</?table>)).)*)</table>", re.S)
    type_id = 0
    new_ans_no = []
    for num, one_type in enumerate(ans22):
        # Expected number of answers in this section (taken from the question part).
        one_type_num = item_type_classify[all_type[num]]
        if len(all_type2) != len(item_type_classify):  # some type occurs more than once
            one_type_num = item_type_num[num][1]
        print("one_type_num :", one_type_num)

        item_id = 0
        one_type_ans = []
        one_type_no = []
        # Strategy 1: every question number starts its own line.
        item_no_info = pre_get_item_no('\n'+one_type, ans_item_no_type)
        print("初步题号:", item_no_info)
        if not item_no_info:
            if "table" in one_type:
                ans_no, table_ans = get_table_ans(one_type)
                if table_ans:
                    one_type_ans.extend(table_ans)
                    one_type_no.extend(ans_no)
            else:
                one_type_no, one_type_ans = get_array_ans(one_type, new_ans_no, item_res[0]['item_id'],
                                                          temp_type=all_type[num])
        else:
            local_ans_no1, items_no_idx = get_right_no(item_no_info)
            print("local_items_no:",local_ans_no1)
            print("local_items_no_idx:", items_no_idx)
            is_from_0 = 1
            if items_no_idx[0] != 0 and local_ans_no1[0] > 1:
                items_no_idx.insert(0, 0)
                is_from_0 = 0
            one_item_split = _slice_answers("\n" + one_type, items_no_idx, ans_item_no_type)
            while "table" in one_item_split[0]:
                ans_no, table_ans = get_table_ans(one_item_split[0])
                one_type_ans.extend(table_ans)
                one_type_no.extend(ans_no)
                without_tables = table_re.sub("", one_item_split[0])
                if without_tables == one_item_split[0]:
                    # Nothing was removed (e.g. stray "table" text): stop instead
                    # of looping forever and re-adding the same answers.
                    break
                one_item_split[0] = without_tables
            if not is_from_0:
                if re.match('(\n|^)故[::]?选[::]|故[::]?答案分?别?[为是]?'
                            '|【([解分][析答]|详解|点[评睛])】|(答案|解析|详解)[::]',
                            one_item_split[0].replace(" ", "")) is None:
                    one_item_split = one_item_split[1:]
                    items_no_idx = items_no_idx[1:]
                else:
                    local_ans_no1.insert(0, local_ans_no1[0]-1)

            # ---- check whether the number of answers is right ----
            one_type_no1 = one_type_no.copy()
            one_type_no1.extend(local_ans_no1)
            print('开始:', one_type_no1, one_type_num)
            if len(one_type_no1) == one_type_num:
                print("第一种题号切分方案")
                one_type_ans.extend(one_item_split)
                one_type_no = one_type_no1
            else:
                # Strategy 2: several answers per line.
                one_type_str = '\n' + '\n'.join(one_item_split)
                ans_no1, ans_no_idx1 = get_many_ans_no(one_type_str, ans_item_no_type)
                local_ans_no2, ans_no_idx2 = get_right_no((ans_no_idx1, ans_no1), 1)
                one_type_no2 = one_type_no.copy()
                one_type_no2.extend(local_ans_no2)
                if len(one_type_no2) == one_type_num:
                    print("第二种题号切分方案")
                    one_type_ans.extend(_slice_answers(one_type_str, ans_no_idx2, ans_item_no_type))
                    one_type_no = one_type_no2
                else:
                    # Strategies 3 and 4: both results of the reget=1 call.
                    ans_no21, ans_no_idx21,ans_no22, ans_no_idx22 = get_many_ans_no(one_type_str, ans_item_no_type, reget=1)
                    local_ans_no3, ans_no_idx3 = get_right_no((ans_no_idx21, ans_no21), 1)
                    one_type_no3 = one_type_no.copy()
                    one_type_no3.extend(local_ans_no3)
                    if len(one_type_no3) == one_type_num:
                        print("第三种题号切分方案")
                        one_type_ans.extend(_slice_answers(one_type_str, ans_no_idx3, ans_item_no_type))
                        one_type_no = one_type_no3
                    else:
                        local_ans_no4, ans_no_idx4 = get_right_no((ans_no_idx22, ans_no22), 1)
                        one_type_no4 = one_type_no.copy()
                        one_type_no4.extend(local_ans_no4)
                        if len(one_type_no4) == one_type_num:
                            print("第四种题号切分方案")
                            one_type_ans.extend(_slice_answers(one_type_str, ans_no_idx4, ans_item_no_type))
                            one_type_no = one_type_no4
                        else:
                            print("第" + str(num + 1) + "大题(" + all_type[num], ")答案个数有问题!!!")
                            is_amount_wrong = 1  # never reset: later sections also match by number only
                            # Accept a result that is at most two answers short.
                            if 0<one_type_num-len(one_type_no1)<=2:
                                one_type_ans.extend(one_item_split)
                                one_type_no = one_type_no1
                            elif 0<one_type_num-len(one_type_no2)<=2:
                                one_type_ans.extend(_slice_answers(one_type_str, ans_no_idx2, ans_item_no_type))
                                one_type_no = one_type_no2

        print("第" + str(num + 1) + "大题(" + all_type[num], ")答案题号:", one_type_no)

        # ---- merge the answers into the questions ----
        if not is_amount_wrong:
            for k, one_item in enumerate(one_type_ans):
                temp_id = item_id + type_id  # position of this answer in the whole answer list
                # Positional guess disagrees with the answer's own number: look the number up.
                if len(one_type_no) == len(one_type_ans) and item_res[temp_id]["item_id"] != one_type_no[k]:
                    temp_id = _locate_item(item_res, one_type_no[k], temp_id)
                res_con = item_res[temp_id]['stem']
                item_res[temp_id].update(
                    only_parse_split(one_item, all_type[num], res_con))
                item_id += 1
        else:
            for k, one_item in enumerate(one_type_ans):
                temp_id = _locate_item(item_res, one_type_no[k], None)
                if temp_id is not None:  # a question with the same number exists
                    res_con = item_res[temp_id]['stem']
                    item_res[temp_id].update(
                        only_parse_split(one_item, all_type[num], res_con))
                item_id += 1
        type_id += item_id
    return item_res, rd1_is_fail
def get_table_ans(ans_str):
    """
    Collect numbered answers from the HTML tables inside ``ans_str``.

    Every ``<table>`` is split into rows (``<tr>``) and cells (``<td>``,
    optionally wrapped in ``<p>``). Rows are consumed in pairs: the first row
    of a pair holds the question numbers, the second row holds the answers.
    A cell of the number row counts as a number when it contains at least one
    digit; its answer is the cell at the same column of the following row (an
    empty string when that row is shorter).

    :param ans_str: answer text that may contain ``<table>`` markup; may be
        empty or None.
    :return: ``(ans_no, table_ans)`` - question numbers (ints) and the answer
        cells, index-aligned. Both lists are empty when no table rows are
        found or when the total number of rows is odd.
    """
    table_ans = []
    ans_no = []
    if not ans_str or "table" not in ans_str:
        return ans_no, table_ans

    # Tables are expected to be a plain grid without merged cells.
    row_list = []
    for one_table in re.finditer('<table>(((?!(</?table>)).)*)</table>', ans_str, re.S):
        for row in re.finditer(r'<tr>(((?!(</?tr>)).)*)</tr>', one_table.group(1), re.S):
            row_list.append(re.split(r'</p></td>|<td><p>|</td><td>|</td>|<td>', row.group(1)))
    if not row_list:
        return ans_no, table_ans

    print("^^^^^^存在答案放在表格里的情况!^^^^^^^")
    if len(row_list) % 2 != 0:
        print('有表格形式呈现的答案不是偶数行')
        return ans_no, table_ans

    for no_row, ans_row in zip(row_list[0::2], row_list[1::2]):
        for col, cell in enumerate(no_row):
            # Convert the digits only, so surrounding markup or text in the
            # cell cannot break int().
            digits = re.sub(r"[^\d]", "", cell)
            if not digits:
                continue
            ans_no.append(int(digits))
            # Pick the answer at the same column: the split leaves empty
            # filler cells between "<p>"-wrapped cells, and a plain slice
            # would count those as answers.
            table_ans.append(ans_row[col] if col < len(ans_row) else "")
    return ans_no, table_ans
def only_parse_split(one_item_ans, item_type, res_con, reparse_n=1):
    """
    Split the answer text of one question into a short answer and an explanation.

    NOTE(review): the nesting of this function was rebuilt from a source whose
    indentation had been stripped; the spots marked below should be confirmed.

    :param one_item_ans: answer/explanation text of a single question.
    :param item_type: question-type label such as "单选题" or "填空题"; an
        empty value or None selects the "unknown type" rules.
    :param res_con: stem of the question; only searched for underscore blanks
        when deciding whether a multi-part explanation yields "见解析".
    :param reparse_n: when equal to 2 the multi-part "见解析" shortcut is skipped.
    :return: dict with the keys 'key' (short answer) and 'parse' (explanation).
    """
    # None used to raise on .replace(); a falsy type already means "unknown type".
    item_type = item_type or ""
    # Drop a trailing line made of subject names followed by 【答案】.
    one_item_ans = re.sub(r"\n\s*(化学|物理|生物|和|与)+\s*【答案】\s*$", '', one_item_ans)
    dd = {'parse': one_item_ans, 'key': ""}
    temp_ans = one_item_ans
    one_item_ans = one_item_ans.split("【答案】", maxsplit=1)[-1]

    # Chinese characters (plus ∵/∴) left after removing the section labels.
    simp_item = re.sub(r"(【([解分][析答]|详解|点[评睛])】|答案|解析|详解)\s*[::]?", "", one_item_ans)
    simp_item = re.sub("[^\u4e00-\u9fa5∵∴]", "", simp_item)
    deng_num = re.findall(r"((?!(src|width|height|style)).)+?([==]).+?", one_item_ans, re.S)
    huanheng_num = re.findall("\n+", one_item_ans, re.S)
    # Little Chinese text, no reasoning words and fewer than two equations:
    # treat the text as a bare answer without explanation.
    if len(simp_item) < 10 and re.search("因为?|因此|所以|根据|依据|若|假设", simp_item) is None and len(deng_num) < 2:
        dd['parse'] = ""
        # NOTE(review): assumed to be nested here (multi-line text is still kept as explanation).
        if len(huanheng_num) > 1:
            dd['parse'] = one_item_ans
    sim_parse = dd['parse']  # explanation without the closing remark, used to look for the answer

    if re.search(r"【(解析|解答|分析|详解|点评|点睛)】\n?|(解析|解答|分析|详?解|点评|点睛)\s*[::]", one_item_ans):
        dd1 = dict(zip(["key", "parse_title", "parse"],
                       re.split(r"【(解析|解答|分析|详解|点评|点睛)】\n?", one_item_ans, maxsplit=1)))
        if len(dd1) == 1:
            dd1 = dict(zip(["key", "parse_title", "parse"],
                           re.split(r"(解)\s*[::]", one_item_ans, maxsplit=1)))
        if "【答案】" in temp_ans:
            dd["key"] = dd1["key"]
        if len(dd1) >= 3:
            dd["parse"] = "【" + dd1["parse_title"] + "】" + dd1["parse"]
            del dd1["parse_title"]
            # NOTE(review): assumed to belong to this branch.
            sim_parse = re.split("【点评】|【点睛】", dd["parse"])[0].strip()

    # Remove images that trail the explanation.
    tail_img = re.compile(r'\n\s*<imgsrc\d+\sw_h=(\d+\*\d{3})/>\s*$')
    while tail_img.search(sim_parse):
        sim_parse = tail_img.sub("", sim_parse)

    if item_type.replace("题", "") in ["单选", "多选", "选择", "不定选择"]:
        plain_parse = dd["parse"].replace("$", "")
        ans = re.search(r'故选\s*[::]?\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>'
                        r'|故选\s*[::]?\s*([A-Z;;和与、、\s]+)', plain_parse)
        ans1 = re.search(r'故答案[为是有]\s*[::]\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>'
                         r'|故答案[为是有]\s*[::]?\s*([A-Z;;和与、、\s]+)', plain_parse)
        if ans:
            dd["key"] = ans.group(1) if ans.group(1) is not None else ans.group(2)
        if ans1:
            dd["key"] = ans1.group(1) if ans1.group(1) is not None else ans1.group(2)
        elif not dd['key']:
            dd['key'] = one_item_ans.strip()
        # NOTE(review): assumed to run for every choice answer, not only for the fallback above.
        dd['key'] = re.sub(r"[.;;.]\s*$", "", dd['key'])
    elif item_type:
        ans0 = re.search(r'故选\s*[::]?\s*([A-Z;;和与、、\s]+)[..;;。]?$', sim_parse)  # non-choice items may still end with a choice
        ans01 = re.search(r'故选\s*[::]\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>', sim_parse.replace("$", ""))
        ans1 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(((?!(<img)).)+?)[..]?\s*(\n|$)', sim_parse)
        ans11 = re.search(r'((?<!解)答\s*[::]|整理得\s*[::]?)\s*(.+?)([..;;]?\s*$|[..]\s*\n)', sim_parse)
        ans2 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)', sim_parse, re.S)
        ans22 = re.search(r'(故|因[而此]|所以)\s*[::]?\s*(答案分?别?[为是填]?|填)\s*[::]?\s*([^∴∵故因所即【】]+?)([..]\s*(\n|$)|$)', sim_parse)
        ans21 = re.search(r'综上所述\s*[::]\s*([^∴∵故因所即【】]+?)[..;;]\s*$', sim_parse)
        ans3 = re.search(r'(故|因[而此]|所以|∴)\s*[::]?.+?[为是填]\s*[::]?\s*([^∴∵故因所即则【】]+?)([..;;,,]\s*$|[..]\s*\n)', sim_parse)
        ans31 = re.search(r'(故|因[而此]|所以|∴)\s*([^当为是填∴∵因所故即则【】]+?)[..;;]\s*$', sim_parse)
        ans32 = re.search(r'(故|因[而此]|所以)\s*[::]?[^当为是填∴∵因所故即【】]+?[为是填]\s*[::]?\s*(<imgsrc.+?/>)[..]?\s*(\n|$)', sim_parse, re.S)
        ans4 = re.search(r'\n\s*[==]([^=\n]+?)[..]?\s*$', sim_parse)
        ans41 = re.search(r'原式\s*[==].+?[==](?!")(((?!(=|=[^"])).)+?|\s*<imgsrc.+?/>)([..]?\s*$|[..]\s*\n)', sim_parse)
        # More than one sub-question marker in the explanation (and not a
        # fill-in item whose stem has exactly one blank): point to the explanation.
        if reparse_n != 2 and "【答案】" not in one_item_ans and dd['parse'] and \
                len(re.findall(r"[((]\d[))]|[\n::;;。】]([((](i{1,3}|[ⅰⅱⅲⅳⅠⅡⅢIV①②③④])[))]|[①②③④]\s*(?![+-]))",
                               sim_parse.replace(" ", ""))) > 1 and not (item_type == '填空题' and len(re.findall(r"_{2,}|_+([^_]*?)_+", res_con)) == 1):
            dd["key"] = "见解析"
        elif ans0:
            dd["key"] = ans0.group(1)
        elif ans01:
            dd["key"] = ans01.group(1)
        elif ans1 or ans11:
            dd["key"] = ans1.group(3) if ans1 else ans11.group(2)
        elif ans2:
            dd["key"] = ans2.group(3)
        elif ans22:
            dd["key"] = ans22.group(3)
        elif ans21:
            dd["key"] = ans21.group(1)
        elif (ans3 or ans31 or ans32) and '证明' not in one_item_ans:
            # Later matches override earlier ones.
            if ans3:
                dd["key"] = ans3.group(2)
            if ans31:
                dd["key"] = ans31.group(2)
            if ans32:
                dd["key"] = ans32.group(2)
        elif (ans4 or ans41) and '证明' not in one_item_ans:
            if ans4:
                dd["key"] = ans4.group(1)
            if ans41:
                dd["key"] = ans41.group(1)
        elif not dd['parse']:
            dd['key'] = one_item_ans.strip()
        else:
            dd["key"] = "见解析"
    else:  # question type unknown
        if len(simp_item) < 10:
            dd["key"] = re.sub(r"【答案】|答案\s*[::]", "", one_item_ans.strip())
        else:
            ans1 = re.search(
                r'故选\s*[::]\s*<imgsrc\d+\sdata-latex="([A-Z;;和与、、\s]+)"/>|故选\s*[::]?\s*([A-Z;;和与、、\s]+)',
                dd["parse"].replace("$", ""))
            ans2 = re.search(r'故\s*[::]?\s*答案分?别?[为是]?\s*[::]?\s*(.+?)[..]\s*(\n|$)', dd["parse"])
            ans3 = re.search(r'(【答案】|答案)\s*[::]?(.+?)(\n|$)', dd["parse"])
            if ans1:
                dd["key"] = ans1.group(1) if ans1.group(1) is not None else ans1.group(2)
            elif ans2:
                dd["key"] = ans2.group(1)
            elif ans3:
                dd["key"] = ans3.group(2)
                dd["parse"] = dd["parse"].replace(ans3.group(0), "")
            elif not dd['key']:
                dd['key'] = "见解析"
    return dd
def ans_structure_step1(anss, item_type_classify, item_res):
    """
    Entry point for structuring the answer section.

    Blank entries of ``anss`` are dropped. When exactly one entry starts with
    【答案】 and exactly one starts with 【解析】, the entries between the two
    markers and the entries after 【解析】 are handed to
    ``ans_structure_step2`` separately (tagged 'group_ans' and 'group_parse');
    otherwise all entries are handed over in a single call.

    :param anss: the answer section as a list of strings.
    :param item_type_classify: passed through to ``ans_structure_step2``.
    :param item_res: passed through to ``ans_structure_step2``.
    :return: the value returned by the (last) ``ans_structure_step2`` call.
    """
    lines = [seg for seg in anss if seg.strip()]
    answer_marks = []
    parse_marks = []
    for pos, seg in enumerate(lines):
        text = seg.strip()
        if re.match("【答案】", text):
            answer_marks.append(pos)
        if re.match("【解析】", text):
            parse_marks.append(pos)

    if len(answer_marks) != 1 or len(parse_marks) != 1:
        return ans_structure_step2(lines, item_type_classify, item_res)

    answer_part = lines[answer_marks[0] + 1: parse_marks[0]]
    parse_part = lines[parse_marks[0] + 1:]
    item_res = ans_structure_step2(answer_part, item_type_classify, item_res, 'group_ans')
    return ans_structure_step2(parse_part, item_type_classify, item_res, 'group_parse')
- def ans_structure_step2(anss, item_type_classify, item_res, *group):
- """
- 拆分答案,并根据已拆分好的题目item_res 补上答案和解析
- 有的答案放在表格里,如选择题、填空题、判断题,有的一行多个答案
- 思路:1.先按一行没有多个题答案的情况取答案,数量与题干不同 时 >>>> 2.再按一行多个答案的情况取答案:
- 1)先判断表格,拿到表格的答案;2)一行多个答案
- anss: 一组按所有不重复题号的答案
- item_type_classify: 题目中对各题型的统计
- :return: [{'parse': , 'key': },{},{}]
- """
- while not anss[0]:
- anss = anss[1:]
- if re.match(".+?省.+?试[卷题]|[^a-zA-Z]*?【专题】", anss[0]):
- anss = anss[1:]
- # --------- 答案整体解析----存在一行中有选择题和填空题答案,填空题答案尽量每题占一行----------
- item_type_num = sum(list(item_type_classify.values()))
- all_item_ans = [] # 前期只记录表格答案和排列型答案
- table_ans = []
- ans_no = [] # 只记录表格答案和排列型答案的id
- # 默认表格答案放在最前面 !!!
- while anss and "table" in anss[0]: # 答案以表格形式呈现, 表格应放在前两行位置,不要插在答案中间
- row_list = [] # 要求表格形式为 横纵分明 ,不存在合并
- for tt in re.finditer('<tr>(((?!(</?tr>)).)*)</tr>', anss[0], re.S): # 先划分每行
- tt_list = re.split(r'</p></td>|<td><p>|</td><td>|</td>|<td>', tt.group(1)) # 再划分每列
- # row_list.append([col for col in tt_list if col.strip()]) # 也有可能答案为空
- row_list.append(tt_list)
- if row_list:
- print("^^^^^^存在答案放在表格里的情况!^^^^^^^")
- if len(row_list) % 2 != 0:
- print('表格形式呈现的答案不是偶数行')
- else:
- print("row_list:", row_list)
- for k, v in enumerate(row_list):
- # print('-----',v)
- if (k + 1) % 2 == 1: # 奇数行==》答案序号行
- item_no = [int(i) if re.sub(r"[^\d]", "", i) else -1 for i in v]
- item_no_st = [num for num, i in enumerate(item_no) if i != -1] # 可能开头是-1
- ans_no.extend([i for i in item_no if i != -1]) # 表格序号
- table_ans.extend(row_list[k + 1][item_no_st[0]: item_no_st[-1] + 1]) # 表格答案
- anss = anss[1:]
- print("表格答案:", table_ans)
- all_item_ans.extend(table_ans)
- if re.search("<table>.*?</table>", anss[0], re.S) is None:
- anss[0] = anss[0].split("</table>")[-1].replace("</div>", "")
- # 先按一行没有多个题答案的情况取答案
- anss_str = table_label_cleal("\n" + "\n".join(anss))
- # print(anss_str)
- # if re.search("<table>.+?</table>", anss_str,, re.S) is None: # 放在前面判断比较合适些
- # anss_str = anss_str.split("</table>")[-1].replace("</div>", "")
- ans_item_no_type = 1 # 初步定义答案的题号就是第一种类型!!!
- # 将序号前面是大写字母或分号的情况,加空
- anss_str = re.sub(r"([A-H])\s*[..](\s*([1-4][0-9]|[1-9])\s*[..、、])", r"\1 \2", anss_str)
- while re.search(r"([A-E])(([1-4][0-9]|[1-9])\s*[..、、]\s*[A-E])", anss_str):
- anss_str = re.sub(r"([A-E])(([1-4][0-9]|[1-9])\s*[..、、]\s*[A-E])", r"\1 \2", anss_str)
- def sub1(ss):
- if int(ss.group(3)) - int(ss.group(1)) == 1:
- return ss.group(1)+ss.group(2)+' '+ss.group(3)+'、'+ss.group(4)
- anss_str = re.sub(r"([1-4][0-9]|[1-9])\s*([..、、]\s*[A-E])\s*([1-4][0-9]|[1-9])\s*([A-E]\s*([1-4][0-9]|[1-9])\s*[..、、])", sub1, anss_str)
- anss_str = re.sub(r"([;;])(\s*([1-4][0-9]|[1-9])\s*[.、、])", r"\1 \2", anss_str)
- item_no_info = pre_get_item_no('\n' + anss_str, ans_item_no_type) # 按\n\d、
- print("初步题号:", item_no_info) # 初步按非表格非排列的形式获取题号信息
- if not item_no_info and len(anss_str.replace(" ", "")) < 50:
- item_0 = re.split(r"[1-9]\s*[..、、::]|[1-4][0-9]\s*[..、、::]", anss_str)
- if len(item_0) > 1:
- anss_str = item_0[0]
- array_ans_no, array_ans = get_array_ans(anss_str, ans_no, item_res[0]['item_id'])
- all_item_ans.extend(array_ans)
- ans_no.extend(array_ans_no)
- if len(item_0) > 1:
- all_item_ans.extend(item_0[1:])
- ans_no.extend(re.findall(r"([1-9]|[1-4][0-9])\s*[..、、::]", anss_str))
- # if re.search("[A-Z]{4,}", anss_str):
- # new_ans = []
- # rest_item0 = re.split(r"[1-9]\s*[..、、::]|[1-4][0-9]\s*[..、、::]", anss_str)
- # if len(rest_item0) > 1:
- # new_ans = re.findall("[A-Z](?<!\))", rest_item0[0])
- # new_ans.extend(rest_item0[1:])
- # else:
- # rest_item1 = re.split(r"[、、]", anss_str)
- # if len(rest_item1) > 1:
- # new_ans = re.findall("[A-Z](?<!\))", rest_item1[0])
- # new_ans.extend(rest_item1[1:])
- # if len(all_item_ans) + len(new_ans) < len(item_res):
- # new_ans = []
- # for k in re.finditer("[A-Z]{4,}", anss_str):
- # row_ans = re.findall("[A-Z](?<!\))", str(k.group()))
- # new_ans.append(row_ans)
- # all_item_ans.extend(new_ans)
- # if ans_no:
- # ans_no.extend(list(range(ans_no[-1] + 1, ans_no[-1] + 1 + len(new_ans))))
- # else:
- # ans_no.extend(list(range(item_res[0]['item_id'], 1 + len(new_ans))))
- # elif re.search("^[\d\sA-Z]+$", anss_str):
- # new_ans = []
- # rest_item0 = re.split(r"[1-4][0-9]|[1-9]", anss_str)
- # if len(rest_item0) > 1:
- # new_ans = re.findall("[A-Z](?<!\))", rest_item0[0])
- # if ans_no:
- # ans_no.extend(list(range(ans_no[-1] + 1, ans_no[-1] + 1 + len(new_ans))))
- # else:
- # ans_no.extend(list(range(item_res[0]['item_id'], 1 + len(new_ans))))
- # new_ans.extend(rest_item0[1:])
- # ans_no.extend([int(i) for i in re.findall(r"[1-4][0-9]|[1-9]", anss_str)])
- # all_item_ans.extend(new_ans)
- if len(ans_no) == item_type_num or len(ans_no) == len(item_res): # 只有表格答案或排列答案的情况
- print('答案第1种切分方案:只有表格答案或排列答案的情况!')
- item_res = get_ans_match(item_res, all_item_ans, ans_no, group)
- else:
- ans_no0, ans_no_idx0 = get_right_no(item_no_info) # 初始题号纠正,第一个不换行的题号可能被剔除
- print("ans_no0:", ans_no0)
- print("ans_no_idx0:", ans_no_idx0)
- is_from_0 = 1 # shi
- if ans_no_idx0[0] != 0 and ans_no0[0] > 1:
- ans_no_idx0.insert(0, 0)
- is_from_0 = 0 # fei
- # 除表格、排列型之外的答案
- rest_item_split = [('\n' + anss_str)[i:j] for i, j in zip(ans_no_idx0, ans_no_idx0[1:] + [None])]
- if not is_from_0: # 不是从0开始, 从0开始的话就不用判断
- if re.match('(\n|^)故[::]?选[::]|故[::]?答案分?别?[为是]?'
- '|【([解分][析答]|详解|点[评睛])】|(答案|解析|详解)[::]',
- rest_item_split[0].replace(" ", "")) is None:
- if item_no_info[0][0]==0 or item_no_info[0][1]=='1':
- # 说明rest_item_split[0] 是一行多个排列的答案
- array_ans_no, array_ans = get_array_ans(rest_item_split[0], ans_no, item_res[0]['item_id'])
- if array_ans:
- ans_no.extend(array_ans_no)
- all_item_ans.extend(array_ans)
- # rest_item_split = sum([array_ans.copy(), rest_item_split[1:]], [])
- rest_item_split = rest_item_split[1:]
- ans_no_idx0 = ans_no_idx0[1:]
- else:
- print("rest_item_split[0]:", rest_item_split[0])
- item_0 = re.split(r"([1-9]\s*[..、、::]\s*(?![A-F])|[1-4][0-9]\s*[..、、::]\s*(?![A-F]))", rest_item_split[0], maxsplit=1)
- if len(item_0) > 1:
- rest_item_split[0] = item_0[0]
- # print(item_0[0])
- rest_item_split.insert(1, "\n" + "".join(item_0[1:]))
- ans_no0.insert(0, int(re.findall("[1-9]|[1-4][0-9]", item_0[1])[0])) # 这时补充的肯定与后面序号差别比较大
- ans_no_idx0[0] = re.search(r"([1-9]|[1-4][0-9])\s*[..、、::]\s*(?![A-F])", rest_item_split[0]).start()
- array_ans_no, array_ans = get_array_ans(rest_item_split[0], ans_no, item_res[0]['item_id'])
- if array_ans:
- ans_no.extend(array_ans_no)
- all_item_ans.extend(array_ans)
- # rest_item_split = sum([array_ans.copy(), rest_item_split[1:]], [])
- else:
- ans_no_idx0 = ans_no_idx0[1:]
- if 'imgsrc' in rest_item_split[0]: # 针对答案前面存在图的情况,图暂时不管(不清楚是什么图)
- temp_str0 = re.sub('<imgsrc.*?/>', "", rest_item_split[0]).replace(" ", "")
- if len(temp_str0) < 3 and ans_no0 and ans_no0[0] > 9:
- is_from_0 = 1
- rest_item_split = rest_item_split[1:]
- anss_str = "\n" + "\n".join(rest_item_split[1:]) # 更新anss_str
- # if re.search("[1-9]\s*[-~~]\s*([1-9]|1[0-9]).+?", rest_item_split[0]):
- # new_ans = re.findall("[A-Z](?<!\))", rest_item_split[0])
- # if len(new_ans) == ans_no0[0]-1:
- # if ans_no:
- # ans_no.extend(list(range(ans_no[-1] + 1, ans_no[-1] + 1 + len(new_ans))))
- # else:
- # ans_no.extend(list(range(item_res[0]['item_id'], 1 + len(new_ans))))
- # new_ans.extend(rest_item_split[1:])
- # rest_item_split = new_ans.copy()
- # if re.search("[A-Z]{4,}", rest_item_split[0]): # 一般是单选题,多选题答案放在一起的情况暂不考虑,还没遇到
- # # 存在排列型答案
- # rest_item0 = re.split(r"([1-9]\s*[..、、::]|[1-4][0-9]\s*[..、、::])", rest_item_split[0])
- # if len(rest_item0) > 1:
- # rest_item_split[0] = rest_item0[0]
- # # rest_item_split[1] = "\n"+" ".join(rest_item0[1:]) + rest_item_split[1]
- # rest_item_split.insert(1, "\n"+" ".join(rest_item0[1:]))
- # ans_no0.insert(0, int(re.findall("[1-9]|[1-4][0-9]", rest_item0[1])[0])) # 这时补充的肯定与后面序号差别比较大
- # # print('rest_item0:',rest_item0)
- # new_ans = []
- # for k in re.finditer("[A-Z]{4,}", rest_item_split[0]):
- # row_ans = re.findall("[A-Z](?<!\))", str(k.group()))
- # new_ans.append(row_ans)
- # if len(new_ans) != ans_no0[0]-1:
- # new_ans = re.findall("[A-Z](?<!\))", rest_item_split[0])
- # if ans_no:
- # ans_no.extend(list(range(ans_no[-1] + 1, ans_no[-1] + 1 + len(new_ans))))
- # else:
- # ans_no.extend(list(range(item_res[0]['item_id'], 1 + len(new_ans)))) # 默认从1开始,严谨的话,应按题目中第一题的题号开始计
- # # 更新anss_str,all_item_ans,rest_item_split!!!!!!
- # # anss_str = re.sub("^.+?"+"".join(new_ans[-4:]), "", anss_str)
- # anss_str = "\n".join(rest_item_split[1:])
- # all_item_ans.extend(new_ans)
- # new_ans.extend(rest_item_split[1:])
- # rest_item_split = new_ans.copy()
- # elif re.search("^[\d\sA-Z]+$", rest_item_split[0]):
- # new_ans = []
- # rest_item0 = re.split(r"[1-4][0-9]|[1-9]", rest_item_split[0])
- # if len(rest_item0) > 1:
- # new_ans = re.findall("[A-Z](?<!\))", rest_item0[0])
- # if ans_no:
- # ans_no.extend(list(range(ans_no[-1] + 1, ans_no[-1] + 1 + len(new_ans))))
- # else:
- # ans_no.extend(list(range(item_res[0]['item_id'], 1 + len(new_ans))))
- # new_ans.extend(rest_item0[1:])
- # ans_no.extend([int(i) for i in re.findall(r"[1-4][0-9]|[1-9]", rest_item_split[0])])
- # rest_item_split = rest_item_split[1:]
- # anss_str = "\n"+"\n".join(rest_item_split)
- # all_item_ans.extend(new_ans)
- # else:
- # ans_no_idx0 = ans_no_idx0[1:]
- # if 'imgsrc' in rest_item_split[0]: # 针对答案前面存在图的情况
- # temp_str0 = re.sub('<imgsrc.*?/>', "", rest_item_split[0]).replace(" ", "")
- # if len(temp_str0)<3 and ans_no0 and ans_no0[0]>9:
- # is_from_0 = 1
- # rest_item_split = rest_item_split[1:]
- else:
- ans_no0.insert(0, ans_no0[0] - 1)
- # 没有一行多个答案的情况
- print("ans_no:",ans_no) # ans_no只记录表格答案和排列型答案
- pre_split_ansinfo_list = all_item_ans, ans_no, ans_no0, ans_no_idx0, anss_str, is_from_0, ans_item_no_type
- all_item_ans, ans_no = ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list)
- # ans_no1 = ans_no.copy()
- # ans_no1.extend(ans_no0) # ans_no0 按换行切分的答案
- # print("ans_no0:", ans_no0, len(ans_no0), len(ans_no1))
- # # print("ans_no_idx0:", ans_no_idx0, )
- # # print("rest_item_split:", rest_item_split,len(rest_item_split))
- # if len(ans_no1) != item_type_num and len(ans_no1) != len(item_res):
- # if is_from_0: # 答案确实只有部分或前面答案是图片形式
- # print('答案第6种切分方案:答案确实只有部分或前面答案是图片形式!')
- # rest_item_split = [del_no(k) for k in rest_item_split]
- # all_item_ans.extend(rest_item_split)
- # ans_no = ans_no1
- # item_res = get_ans_match(item_res, all_item_ans, ans_no1, group)
- # else:
- # # 存在一行多个的情况,题号前必须有空格
- # temp_ans_no1, ans_no_idx1 = get_many_ans_no("\n"+anss_str, ans_item_no_type) # 初步获取
- # temp_ans_no1, ans_no_idx1 = get_right_no((ans_no_idx1, temp_ans_no1), 1) # 筛选
- # ans_no2 = ans_no.copy()
- # ans_no2.extend(temp_ans_no1)
- # if len(ans_no2) != item_type_num and len(ans_no2) != len(item_res): # 一行多个,题号前可有可无空格
- # # 题号前不要求空格时,出错的概率大,最好限制下范围,在按换行获取的题号中对不连续的题号做这种操作!!!!
- # new_anss_str = "\n"+anss_str
- # seq_no = find_seq_num(ans_no1)
- # ans_no_after = []
- # no_idx_after = []
- # if seq_no:
- # ans_no_after.extend(ans_no0[ans_no0.index(seq_no[-1][0]):])
- # no_idx_after.extend(ans_no_idx0[ans_no0.index(seq_no[-1][0]):])
- # breakp = ans_no_idx0[ans_no0.index(seq_no[-1][0])]
- # new_anss_str = ("\n"+anss_str)[:breakp]
- # temp_ans_no21, ans_no_idx21, temp_ans_no22, ans_no_idx22 = get_many_ans_no(new_anss_str, ans_item_no_type, reget=1)
- # temp_ans_no21.extend(ans_no_after)
- # ans_no_idx21.extend(no_idx_after)
- # temp_ans_no21, ans_no_idx21 = get_right_no((ans_no_idx21, temp_ans_no21), 1) # 筛选
- # ans_no3 = ans_no.copy()
- # ans_no3.extend(temp_ans_no21)
- # if len(ans_no3) != item_type_num and len(ans_no3) != len(item_res): # 无空格情况下,遇到两位数先取2位
- # temp_ans_no22.extend(ans_no_after)
- # ans_no_idx22.extend(no_idx_after)
- # temp_ans_no22, ans_no_idx22 = get_right_no((ans_no_idx22, temp_ans_no22), 1) # 筛选
- # ans_no4 = ans_no.copy()
- # ans_no4.extend(temp_ans_no22)
- # if len(ans_no4) != item_type_num and len(ans_no4) != len(item_res):
- # print('答案格式(无序号)影响答案个数有问题!!!', )
- # # 也有可能题目比答案多,如理综题目不全
- # if abs(len(item_res) - len(ans_no1)) <= 2 or (len(ans_no1) > len(item_res) and ans_no1[0]==1):
- # print('1111111111111111111111')
- # rest_item_split = [del_no(k) for k in rest_item_split]
- # all_item_ans.extend(rest_item_split)
- # ans_no = ans_no1
- # # item_res = get_ans_match(item_res, all_item_ans, ans_no1, group)
- # elif abs(len(item_res) - len(ans_no2)) <= 2 or (len(ans_no2) > len(item_res) and ans_no2[0]==1):
- # print('2222222222222222222222',ans_no2)
- # rest_item_split = [del_no(("\n" + anss_str)[i:j]) for i, j in
- # zip(ans_no_idx1, ans_no_idx1[1:] + [None])]
- # all_item_ans.extend(rest_item_split)
- # ans_no = ans_no2
- # # item_res = get_ans_match(item_res, all_item_ans, ans_no2, group)
- # # elif all_item_ans: # 最后起码把表格答案、排列型答案先填上
- # # item_res = get_ans_match(item_res, all_item_ans, ans_no, group)
- # else:
- # print('答案第5种切分方案:存在一行多个答案,每个答案题号前可以没有空格2')
- # rest_item_split = [del_no(("\n" + anss_str)[i:j]) for i, j in
- # zip(ans_no_idx22, ans_no_idx22[1:] + [None])]
- # all_item_ans.extend(rest_item_split)
- # ans_no = ans_no4
- # # item_res = get_ans_match(item_res, all_item_ans, ans_no4, group)
- # else:
- # print('答案第4种切分方案:存在一行多个答案,每个答案题号前可以没有空格1')
- # rest_item_split = [del_no(("\n" + anss_str)[i:j]) for i, j in
- # zip(ans_no_idx21, ans_no_idx21[1:] + [None])]
- # all_item_ans.extend(rest_item_split)
- # ans_no = ans_no3
- # # item_res = get_ans_match(item_res, all_item_ans, ans_no3, group)
- # else:
- # print('答案第3种切分方案:存在一行多个答案,且每个答案题号前必须有空格或顶格')
- # rest_item_split = [del_no(("\n"+anss_str)[i:j]) for i, j in zip(ans_no_idx1, ans_no_idx1[1:] + [None])]
- # all_item_ans.extend(rest_item_split)
- # ans_no = ans_no2
- # # item_res = get_ans_match(item_res, all_item_ans, ans_no2, group)
- #
- # else:
- # print('答案第2种切分方案:没有一行多个答案的情况!')
- # rest_item_split = [del_no(k) for k in rest_item_split]
- # all_item_ans.extend(rest_item_split)
- # ans_no = ans_no1
- item_res = get_ans_match(item_res, all_item_ans, ans_no, group)
- return item_res
# Range header such as "1-5 ABCDE": first number, separator, last number,
# then at least two option letters.
_ARRAY_RANGE_RE = re.compile(r"([1-9]|1[0-9])\s*[-~-~~]\s*(1?[0-9])\s*([A-Z]\s*){2,}")
# Cuts the text at every "a-b" range header (letters not required here).
_ARRAY_RANGE_SPLIT_RE = re.compile(r"\n*[1-9]\s*[-~-~~]\s*1?[0-9]|\n*1[0-9]\s*[-~-~~]\s*1?[0-9]")
# An option letter that is not immediately followed by ")".
_ARRAY_OPTION_RE = re.compile(r"[A-Z](?!\))")
# Any capital letter (what the original no-op lookbehind "[A-Z](?<!\))" matched).
_ARRAY_LETTER_RE = re.compile(r"[A-Z]")
# Question numbers 1-49; used for BOTH splitting and number extraction so the
# two results always have matching lengths.
_ARRAY_ITEM_NO_RE = re.compile(r"[1-4][0-9]|[1-9]")
_ARRAY_CHOICE_TYPES = ('选择', '单选', '多选', '不定选择')


def _split_range_chunk(chunk, expected):
    """Split the text after one ``a-b`` header into ``expected`` answers.

    Tries option letters first, then whitespace-separated tokens, then
    comma-separated tokens. Returns None when no strategy yields exactly
    ``expected`` pieces.
    """
    letters = _ARRAY_OPTION_RE.findall(chunk)
    if len(letters) == expected:
        return letters
    stripped = chunk.strip()
    by_space = re.split(r"\s+", stripped)
    if len(by_space) == expected:
        return by_space
    by_comma = re.split("[、、]+", stripped)
    if len(by_comma) == expected:
        return by_comma
    return None


def _collect_range_answers(one_item, range_hits):
    """Handle the ``1-5 ABCDE 6-10 BCDAE`` layout; return (numbers, answers)."""
    numbers, answers = [], []
    no_list = [list(range(int(m.group(1)), int(m.group(2)) + 1)) for m in range_hits]
    row_split = _ARRAY_RANGE_SPLIT_RE.split(one_item)
    # Every header must own exactly one following chunk, otherwise the text
    # contains "a-b" fragments that are not answer ranges.
    if len(row_split) - len(no_list) != 1:
        return numbers, answers

    lead = row_split[0].strip()
    if lead:
        # Text before the first header can only be the answers of the
        # unnumbered questions 1 .. first-1.
        missing = int(range_hits[0].group(1)) - 1
        if missing < 1:
            return numbers, answers
        lead_ans = re.split(r"\s+", lead)
        if len(lead_ans) < missing:
            lead_ans = _ARRAY_OPTION_RE.findall(row_split[0])
        if len(lead_ans) >= missing:
            # Keep the last `missing` tokens; anything before is header noise.
            answers.extend(lead_ans[-missing:])
            numbers.extend(range(1, missing + 1))
        # NOTE(review): the numbered ranges below are still processed when the
        # prefix is too short; confirm this matches the intended nesting.

    for nos, chunk in zip(no_list, row_split[1:]):
        picked = _split_range_chunk(chunk, len(nos))
        # Numbers are appended only together with their answers so the two
        # returned lists stay index-aligned.
        if picked is not None:
            numbers.extend(nos)
            answers.extend(picked)
    return numbers, answers


def get_array_ans(one_item, raw_ans_no, st_no, temp_type=""):
    """
    Extract answers written in a compact "array" layout.

    Three layouts are recognised, in this order:

    1. Ranges followed by option letters, e.g. ``1-5 ABCDE 6-10 BCDAE``.
    2. Numbered single answers, e.g. ``1A 2B 3C`` (only for choice-type
       questions), or text made solely of digits, capitals, whitespace and
       separators. Letters in front of the first number are treated as
       unnumbered answers.
    3. Anything else: every capital letter is taken as one answer.

    :param one_item: text that holds the array-style answers
    :param raw_ans_no: answer numbers collected so far; unnumbered answers
        continue after its last element
    :param st_no: first question number used when ``raw_ans_no`` is empty
        (layout 2 only)
    :param temp_type: question type name; a trailing "题" is ignored
    :return: ``(one_item_no, one_item_ans)`` - question numbers and the
        answers at the same indexes
    """
    one_item_no, one_item_ans = [], []
    range_hits = list(_ARRAY_RANGE_RE.finditer(one_item))

    if range_hits:
        one_item_no, one_item_ans = _collect_range_answers(one_item, range_hits)
    elif (re.search(r"([1-9]|1[0-9])\s*[A-Z]", one_item)
          and temp_type.replace("题", "") in _ARRAY_CHOICE_TYPES) \
            or re.search(r"^[\d\sA-Z..、、\n]+$", one_item):  # 1A 2B 3C 4D
        pieces = _ARRAY_ITEM_NO_RE.split(one_item)
        lead = pieces[0]
        if re.search("[A-F]", lead.strip()):
            # Unnumbered answers in front of the first question number.
            lead_ans = _ARRAY_LETTER_RE.findall(lead)
            one_item_ans.extend(lead_ans)
            start = raw_ans_no[-1] + 1 if raw_ans_no else st_no
            one_item_no.extend(range(start, start + len(lead_ans)))
        # pieces[1:] holds one chunk per matched number, in the same order.
        one_item_ans.extend(pieces[1:])
        one_item_no.extend(int(n) for n in _ARRAY_ITEM_NO_RE.findall(one_item))
    else:
        row_ans = _ARRAY_LETTER_RE.findall(one_item)
        one_item_ans.extend(row_ans)
        if raw_ans_no and row_ans:
            first = raw_ans_no[-1] + 1
            one_item_no.extend(range(first, first + len(row_ans)))
        else:
            one_item_no.extend(range(1, 1 + len(row_ans)))
    return one_item_no, one_item_ans
def ans_select(item_res, item_type_num, rest_item_split, pre_split_ansinfo_list):
    """
    Choose the answer segmentation whose size agrees with the questions.

    Candidate numberings are tried in this order, and the first whose length
    equals ``item_type_num`` or ``len(item_res)`` wins:

    1. one answer per line (``ans_no0``);
    2. several answers per line, each number preceded by whitespace;
    3. several answers per line, no whitespace required (two variants
       returned by ``get_many_ans_no(..., reget=1)``).

    If none matches exactly, the per-line or the whitespace variant is used
    when it is "close enough"; otherwise nothing is added.

    :param item_res: parsed questions; only its length is used here
    :param item_type_num: number of questions expected
    :param rest_item_split: answer texts obtained by splitting on line breaks
    :param pre_split_ansinfo_list: 7-tuple ``(all_item_ans, ans_no, ans_no0,
        ans_no_idx0, anss_str, is_from_0, ans_item_no_type)``
    :return: ``(all_item_ans, ans_no)``; ``all_item_ans`` is the list taken
        from the tuple, extended in place
    """
    (all_item_ans, ans_no, ans_no0, ans_no_idx0,
     anss_str, is_from_0, ans_item_no_type) = pre_split_ansinfo_list
    padded_text = "\n" + anss_str

    def count_is_off(numbers):
        # True when the numbering fits neither expected size.
        size = len(numbers)
        return size != item_type_num and size != len(item_res)

    def roughly_fits(numbers):
        # Tolerate a small difference, or surplus answers that start at 1.
        return (abs(len(item_res) - len(numbers)) <= 2
                or (len(numbers) > len(item_res) and numbers[0] == 1))

    def with_known_numbers(extra):
        # Numbers gathered before this call, followed by the new candidates.
        merged = ans_no.copy()
        merged.extend(extra)
        return merged

    def accept_line_split(numbers):
        all_item_ans.extend([del_no(piece) for piece in rest_item_split])
        return all_item_ans, numbers

    def accept_offset_split(numbers, offsets):
        bounds = zip(offsets, offsets[1:] + [None])
        all_item_ans.extend([del_no(padded_text[lo:hi]) for lo, hi in bounds])
        return all_item_ans, numbers

    # --- candidate 1: one answer per line --------------------------------
    ans_no1 = with_known_numbers(ans_no0)
    if not count_is_off(ans_no1):
        print('答案第2种切分方案:没有一行多个答案的情况!')
        return accept_line_split(ans_no1)
    if is_from_0:  # answers are partial, or the leading ones are images
        print('答案第6种切分方案:答案确实只有部分或前面答案是图片形式!')
        return accept_line_split(ans_no1)

    # --- candidate 2: several per line, number preceded by whitespace ----
    temp_ans_no1, ans_no_idx1 = get_many_ans_no(padded_text, ans_item_no_type)  # first pass
    temp_ans_no1, ans_no_idx1 = get_right_no((ans_no_idx1, temp_ans_no1), 1)  # filter
    ans_no2 = with_known_numbers(temp_ans_no1)
    if not count_is_off(ans_no2):
        print('答案第3种切分方案:存在一行多个答案,且每个答案题号前必须有空格或顶格')
        return accept_offset_split(ans_no2, ans_no_idx1)

    # --- candidates 3/4: whitespace before the number is optional --------
    # Matching without whitespace is error-prone, so it is limited to the
    # text before the last sequential run found in the per-line numbers.
    search_text = padded_text
    ans_no_after, no_idx_after = [], []
    seq_no = find_seq_num(ans_no1)
    if seq_no:
        cut = ans_no0.index(seq_no[-1][0])
        ans_no_after.extend(ans_no0[cut:])
        no_idx_after.extend(ans_no_idx0[cut:])
        search_text = padded_text[:ans_no_idx0[cut]]
    temp_ans_no21, ans_no_idx21, temp_ans_no22, ans_no_idx22 = get_many_ans_no(
        search_text, ans_item_no_type, reget=1)

    temp_ans_no21.extend(ans_no_after)
    ans_no_idx21.extend(no_idx_after)
    temp_ans_no21, ans_no_idx21 = get_right_no((ans_no_idx21, temp_ans_no21), 1)  # filter
    ans_no3 = with_known_numbers(temp_ans_no21)
    if not count_is_off(ans_no3):
        print('答案第4种切分方案:存在一行多个答案,每个答案题号前可以没有空格1')
        return accept_offset_split(ans_no3, ans_no_idx21)

    # Same idea, but two-digit numbers are preferred when there is no space.
    temp_ans_no22.extend(ans_no_after)
    ans_no_idx22.extend(no_idx_after)
    temp_ans_no22, ans_no_idx22 = get_right_no((ans_no_idx22, temp_ans_no22), 1)  # filter
    ans_no4 = with_known_numbers(temp_ans_no22)
    if not count_is_off(ans_no4):
        print('答案第5种切分方案:存在一行多个答案,每个答案题号前可以没有空格2')
        return accept_offset_split(ans_no4, ans_no_idx22)

    # --- nothing matched exactly -----------------------------------------
    print('答案格式(无序号)影响答案个数有问题!!!')
    # The questions themselves may be incomplete, so accept a near match.
    if roughly_fits(ans_no1):
        print('1111111111111111111111')
        return accept_line_split(ans_no1)
    if roughly_fits(ans_no2):
        print('2222222222222222222222', ans_no2)
        return accept_offset_split(ans_no2, ans_no_idx1)
    return all_item_ans, ans_no
- # item_res = get_ans_match(item_res, all_item_ans, ans_no, group)
def _unwrap_group(group):
    """Peel nested ``*args`` tuples off ``group`` and return the mode value.

    ``group`` arrives wrapped once by this function's own ``*group`` and may be
    wrapped again when a caller forwards its own ``*group`` tuple. Returns
    None when no mode was supplied (``()`` or ``((),)``).
    """
    while isinstance(group, (tuple, list)):
        if not group:
            return None
        group = group[0]
    return group


def get_ans_match(item_res, all_ans, ans_no, *group):
    """
    Attach each split answer to the question that carries the same number.

    :param item_res: list of question dicts (reads ``item_id``, ``stem``,
        ``type``; writes ``key`` / ``parse``); updated in place
    :param all_ans: answer texts
    :param ans_no: question number of each answer, index-aligned with
        ``all_ans``; surplus entries on either side are ignored
    :param group: optional mode, possibly nested in tuples.
        No mode: the answer is parsed with ``only_parse_split`` and the result
        merged into the question. ``'group_ans'``: the text is stored as
        ``key`` and ``parse`` is cleared. ``'group_parse'``: the text is
        stored as ``parse`` and an empty ``key`` becomes ``'见解析'``.
    :return: ``item_res``
    """
    mode = _unwrap_group(group)
    for one_ans, one_no in zip(all_ans, ans_no):
        # Positions of the questions whose number equals this answer's number.
        candidates = [i for i, v in enumerate(item_res) if v["item_id"] == one_no]
        if not candidates:
            continue  # no such question: drop the answer for now
        if len(candidates) == 1:
            target = candidates[0]
        else:
            # Repeated numbers: fill the first question still without an
            # answer, so answers are consumed in document order.
            target = next((i for i in candidates if "key" not in item_res[i]), None)
            if target is None:
                continue
        item = item_res[target]
        if mode is None:
            # Type and stem both come from the matched question.
            item.update(only_parse_split(one_ans, item["type"], item["stem"]))
        elif mode == 'group_ans':
            item['key'] = one_ans
            item['parse'] = ""
        elif mode == 'group_parse':
            item['parse'] = one_ans
            if not item.get('key'):
                item['key'] = '见解析'
    return item_res
- # def stem_ans_struc_combine(item_type_classify, item_res, all_item_ans, ans_no, group):
- # """
- # 题干结构化与答案结构化的合并
- # :return:
- # """
- # print("item_type_classify:", item_type_classify)
- # print("题干中的题目数量:", len(item_res))
- # print("答案中的题目数量:", len(all_item_ans))
- # if item_type_classify and len(all_item_ans) == sum(list(item_type_classify.values())):
- # res1 = []
- # for num1, one_ans in enumerate(all_item_ans):
- # parse = only_parse_split(one_ans, item_res[num1]["type"], item_res[num1]['stem'])
- # res1.append(parse)
- # return res1, 1
- # elif not item_type_classify and len(all_item_ans) == len(item_res):
- # res1 = []
- # for num1, one_ans in enumerate(all_item_ans):
- # parse = only_parse_split(one_ans, item_res[num1]["type"], item_res[num1]['stem'])
- # res1.append(parse)
- # return res1, 1
- # else:
- # print('答案数量与题干数量不一致,请检查题干和答案中的题号,是否有遗漏答案或答案格式不对;',
- # '答案中若存在一行多个答案时,保证每个题的答案间要留有多个空格!', 2)
- # print("试题个数:", len(item_res))
- # print("答案中的题号:", ans_no)
- # # ----------------------是否正确对上序号还需进一步验证!!!!!!!!!!-------------------------------
- # res1 = []; simp_res = []
- # err_n = 0 # 与题目id没对上号的个数, 默认答案一般也是从前往后排序
- # for k, one_item in enumerate(item_res): # 以题目为主
- # search_range = ans_no
- # if k+3-err_n <= len(ans_no):
- # search_range = ans_no[k-err_n:k+3-err_n]
- # elif k-err_n < len(ans_no):
- # search_range = ans_no[k-err_n:]
- # # print("答案的搜索范围search_range:",search_range)
- # if one_item['item_id'] in search_range: # 在对应位置前
- # ans_no_st = [k1+k-err_n for k1, v1 in enumerate(search_range) if v1 == one_item['item_id']] # 默认取第一个作为对应答案
- # # print("答案的位置{0}:{1}, ----对应题目id:{2}".format(ans_no_st, all_item_ans[ans_no_st[0]],one_item['item_id']))
- # parse = only_parse_split(all_item_ans[ans_no_st[0]], one_item["type"], one_item['stem'])
- # one_item['key'] = parse['key']
- # one_item['parse'] = parse['parse']
- # res1.append(one_item)
- # if group == 'group_ans':
- # simp_res.append({'parse': "", 'key': parse['key'],'item_id':one_item['item_id']})
- # if group == 'group_parse':
- # simp_res.append({'parse': parse['parse'], 'key': parse['key'],'item_id':one_item['item_id']})
- # else:
- # err_n += 1
- # one_item.update({'parse': "", 'key': ""})
- # res1.append(one_item)
- # if group:
- # simp_res.append({'parse': '', 'key': '', 'item_id': one_item['item_id']})
- # if simp_res:
- # return simp_res, 1
- #
- # return res1, 2
- # def ans_structure_step1(anss, item_type_classify, item_res):
- # """
- # 针对答案部分解析结构化汇总
- # anss : 整个答案部分
- # :return: dd = {'parse': , 'key': }
- # """
- # anss = [k for k in anss if k.strip()]
- # ans_label = [k for k, a in enumerate(anss) if re.match("【答案】", a.strip())]
- # parse_label = [k for k, a in enumerate(anss) if re.match("【解析】", a.strip())]
- # if len(ans_label) == 1 and len(parse_label) == 1:
- # ans1 = anss[ans_label[0] + 1: parse_label[0]]
- # parse1 = anss[parse_label[0]+1:]
- # res_ans, flag1 = ans_structure_step2(ans1, item_type_classify, item_res,'group_ans')
- # res_parse, flag2 = ans_structure_step2(parse1, item_type_classify, item_res, 'group_parse')
- # if flag1 == flag2 == 1:
- # for idx, item_r in enumerate(item_res):
- # if not res_ans[idx]['key']:
- # if not res_parse[idx]['key']:
- # item_res[idx]['key'] = "见解析"
- # else:
- # item_res[idx]['key'] = res_parse[idx]['key']
- # else:
- # item_res[idx]['key'] = res_ans[idx]['key']
- #
- # if not res_ans[idx]['parse']:
- # item_res[idx]['parse'] = res_parse[idx]['parse']
- # else: # 解析中的parse肯定有
- # item_res[idx]['parse'] = res_ans[idx]['parse']+"<br/>【解析】"+res_parse[idx]['parse']
- # return item_res
- # elif flag1 == 2:
- # return "【答案】组中题型数量与题目中不一致,请重点检查题目序号,重新手输题目序号"
- # elif flag2 == 2:
- # return "【解析】组中题型数量与题目中不一致,请重点检查题目序号,重新手输题目序号"
- # else:
- # return '【答案】组和【解析】组中题型数量与题目中均不一致,请重点检查题目序号,重新手输题目序号'
- # else:
- # res_ans, flag1 = ans_structure_step2(anss, item_type_classify, item_res)
- # if flag1 == 1:
- # for idx, item_r in enumerate(item_res):
- # item_res[idx]['key'] = res_ans[idx]['key']
- # item_res[idx]['parse'] = res_ans[idx]['parse']
- # else:
- # # return "答案中题目数量与题目中不一致,①请重点检查题目序号,重新手输题目序号;②将参考答案开头没用的信息去掉;" \
- # # "③是否有遗漏答案或答案格式不对;④答案中若存在一行多个答案时,保证每个题的答案间要留有多个空格!"
- # return res_ans
- # return item_res
- #
- #
- # def ans_structure_step2(anss, item_type_classify, item_res, *group):
- # """
- # 拆分答案,并根据已拆分好的题目item_res 补上答案和解析
- # 有的答案放在表格里,如选择题、填空题、判断题,有的一行多个答案
- # 思路:1.先按一行没有多个题答案的情况取答案,数量与题干不同 时 >>>> 2.再按一行多个答案的情况取答案:
- # 1)先判断表格,拿到表格的答案;2)一行多个答案
- # anss: 一组按所有不重复题号的答案
- # item_type_classify: 题目中对各题型的统计
- # :return: [{'parse': , 'key': },{},{}]
- # """
- # while not anss[0]:
- # anss = anss[1:]
- # if re.match(".+?省.+?试[卷题]|[^a-zA-Z]*?【专题】", anss[0]):
- # anss = anss[1:]
- #
- # # # 预处理: 对答案部分的题号进行处理, 将(\d)类型的题号改为\d、类型
- # # sub_item_no = [int(no[0]+no[2]) for no in
- # # re.findall(r'\n\s*([1-9]|[1-4][0-9])\s*[..、、]|\n\s*([1-9]|[1-4][0-9])\s*[..、、].+?\s+([1-9]|[1-4][0-9])\s*[..、、].+?',
- # # "\n" + "\n".join(anss))]
- # # if len(sub_item_no) <= 2:
- # # sub_item_no = [int(no[0]+no[2]) for no in re.findall(r'\n\s*\(([1-9]|[1-4][0-9])\)\s*[..、、]?'
- # # r'|\n\s*\(([1-9]|[1-4][0-9])\)\s*[..、、]?.+?\s+\(([1-9]|[1-4][0-9])\)\s*[..、、]?.+?',
- # # "\n" + "\n".join(anss))]
- # # if len(sub_item_no) > 3:
- # # anss = re.sub(r'\n\s*\(([1-9]|[1-4][0-9])\)\s*[..、、]?', "\n" + r"【@\1、", "\n" + "\n".join(anss))
- # # anss = re.sub(r'(\n【@([1-9]|[1-4][0-9])、.+?\s+)\(([1-9]|[1-4][0-9])\)\s*[..、、]?', r"\1【@\3、", anss)
- # # anss = anss.replace("【@", "").split("\n")[1:]
- #
- # # --------- 答案整体解析----存在一行中有选择题和填空题答案,填空题答案尽量每题占一行----------
- # item_type_num = sum(list(item_type_classify.values()))
- # all_item_ans = []
- # table_ans = []
- # ans_no = []
- # # 默认表格答案放在最前面 !!!
- # while anss and "table" in anss[0]: # 答案以表格形式呈现, 表格应放在前两行位置,不要插在答案中间
- # row_list = [] # 要求表格形式为 横纵分明 ,不存在合并
- # for tt in re.finditer('<tr>(((?!(</?tr>)).)*)</tr>', anss[0], re.S): # 先划分每行
- # tt_list = re.split(r'</p></td>|<td><p>|</td><td>|</td>|<td>', tt.group(1)) # 再划分每列
- # # row_list.append([col for col in tt_list if col.strip()]) # 也有可能答案为空
- # row_list.append(tt_list)
- # if row_list:
- # print("^^^^^^存在答案放在表格里的情况!^^^^^^^")
- # if len(row_list) % 2 != 0:
- # print('表格形式呈现的答案不是偶数行')
- # else:
- # # print("row_list:", row_list)
- # for k, v in enumerate(row_list):
- # # print('-----',v)
- # if (k + 1) % 2 == 1: # 奇数行==》答案序号行
- # item_no = [int(i) if re.sub(r"[^\d]", "", i) else -1 for i in v]
- # item_no_st = [num for num, i in enumerate(item_no) if i != -1] # 可能开头是-1
- # ans_no.extend([i for i in item_no if i != -1]) # 表格序号
- # table_ans.extend(row_list[k + 1][item_no_st[0]: item_no_st[-1] + 1]) # 表格答案
- # anss = anss[1:]
- #
- # # 先按一行没有多个题答案的情况取答案
- # anss_str = table_label_cleal("\n" + "\n".join(anss))
- # if re.search("<table>.+?</table>", anss_str) is None:
- # anss_str = anss_str.split("</table>")[-1].replace("</div>", "")
- #
- # ans_item_no_type = 1 # 初步定义答案的题号就是第一种类型
- # # 将序号前面是大写字母或分号的情况,加空
- # anss_str = re.sub(r"([A-H])\s*[..](\s*([1-4][0-9]|[1-9])\s*[..、、])", r"\1 \2", anss_str)
- # anss_str = re.sub(r"([;;])(\s*([1-4][0-9]|[1-9])\s*[.、、])", r"\1 \2", anss_str)
- #
- # rest_item_split = re.split(r'\n+\s*[1-4][0-9]\s*[..、、]|\n+\s*[1-9]\s*[..、、]', anss_str)
- # if not rest_item_split[0]:
- # rest_item_split = rest_item_split[1:]
- #
- # all_item_ans.extend(table_ans)
- # all_item_ans.extend(rest_item_split)
- # print("表格答案:", table_ans)
- # # pprint(all_item_ans)
- # # ------------先按没有一行多个答案的情况-------------------
- # if item_type_classify and len(all_item_ans) == sum(list(item_type_classify.values())):
- # res1 = []
- # for num1, one_ans in enumerate(all_item_ans):
- # parse = only_parse_split(one_ans, item_res[num1]["type"], item_res[num1]['stem'])
- # res1.append(parse)
- # return res1, 1
- # elif not item_type_classify and len(all_item_ans) == len(item_res):
- # res1 = []
- # for num1, one_ans in enumerate(all_item_ans):
- # parse = only_parse_split(one_ans, item_res[num1]["type"], item_res[num1]['stem'])
- # res1.append(parse)
- # return res1, 1
- # else: # 答案个数与题目不一致时,再按一行多个答案处理(题目个数正常,答案个数比题目少时)
- # print('-----存在一行多个答案的情况-----')
- # all_item_ans = []
- # all_item_ans.extend(table_ans)
- # # 再按一行多个答案的情况取答案
- # manyans_oneline_split = re.split(r'\n\s*[1-4][0-9]\s*[..、、]|\n\s*[1-9]\s*[..、、]'
- # r'|(?<![::..、、+\-*/=])\s[1-4][0-9]\s*[..、、]|(?<![::..、、+\-*/=])\s[1-9]\s*[..、、]'
- # r'|\s{2,}[1-4][0-9]\s*[..、、]|\s{2,}[1-9]\s*[..、、]', anss_str)
- #
- # temp_no = re.findall(r'\n\s*([1-4][0-9]|[1-9])\s*[..、、]'
- # r'|(?<![::..、、+\-*/=])\s([1-4][0-9]|[1-9])\s*[..、、]|\s{2,}([1-4][0-9]|[1-9])\s*[..、、]', anss_str)
- # temp_no = [int("".join(i)) for i in temp_no]
- # # print("temp_no:",temp_no)
- # # print('manyans_oneline_split:', manyans_oneline_split, len(manyans_oneline_split))
- # if not temp_no and not all_item_ans: # 没有表格答案的情况,如1~10 ACBBD...
- # row_ans = re.findall("[A-Z](?<!\))", manyans_oneline_split[0].strip())
- # all_item_ans.extend(row_ans)
- # temp_no = re.findall("(\d)-(\d{1,2})", manyans_oneline_split[0])
- # for t in temp_no:
- # ans_no.extend(list(range(int(t[0]), int(t[1])+1)))
- # if row_ans:
- # manyans_oneline_split = []
- # elif temp_no and not manyans_oneline_split[0]:
- # manyans_oneline_split = manyans_oneline_split[1:]
- # ans_no.extend(temp_no)
- # elif re.match("A-Z", manyans_oneline_split[1].strip()) is None and \
- # len(re.findall("[A-Z](?<!\))", manyans_oneline_split[0].strip())) == len(item_res) - (len(manyans_oneline_split)-1):
- # print('第一行答案不是以题号形式一个个给出')
- # row_ans = re.findall("[A-Z](?<!\))", manyans_oneline_split[0].strip())
- # all_item_ans.extend(row_ans)
- # manyans_oneline_split = manyans_oneline_split[1:]
- # if temp_no and temp_no[0] > len(row_ans):
- # ans_no.extend(list(range(temp_no[0]-len(row_ans), temp_no[0])))
- # ans_no.extend(temp_no)
- # else:
- # print("答案序号有问题!!")
- # ans_no.extend(['']*len(row_ans))
- # ans_no.extend(temp_no)
- # # print("manyans_oneline_split:************")
- # # pprint(manyans_oneline_split)
- # print("ans_no:", ans_no)
- # all_item_ans.extend(manyans_oneline_split)
- # combine_res = stem_ans_struc_combine(item_type_classify, item_res, all_item_ans, ans_no, group)
- # # if not combine_res:
- # # return '答案数量与题干数量不一致,请检查题干和答案中的题号,是否有遗漏答案或答案格式不对;' \
- # # '答案中若存在一行多个答案时,保证每个题的答案间要留有多个空格!', 2
- #
- # return combine_res
- # def manyans_oneline_split(item_str, one_type_num):
- # """
- # 对一行多个答案的情况进行拆分,包含表格形式表示的答案,表格要求放在前面
- # :param item_str:
- # :param one_type_num:
- # :return:
- # """
- # all_item_ans = []
- # table_ans = []
- # ans_no = []
- # if item_str and "table" in item_str: # 答案以表格形式呈现, 表格应放在前两行位置,不要插在答案中间
- # row_list = [] # 要求表格形式为 横纵分明 ,不存在合并
- # for one_table in re.finditer('<table>(((?!(</?table>)).)*)</table>', item_str, re.S):
- # for tt in re.finditer(r'<tr>(((?!(</?tr>)).)*)</tr>', one_table.group(1), re.S):
- # tt_list = re.split(r'</p></td>|<td><p>|</td><td>|</td>|<td>', tt.group(1))
- # # row_list.append([col for col in tt_list if col.strip()]) # 也有可能答案为空
- # row_list.append(tt_list)
- # if row_list:
- # print("^^^^^^存在答案放在表格里的情况!^^^^^^^")
- # if len(row_list) % 2 != 0:
- # print('有表格形式呈现的答案不是偶数行')
- # else:
- # for k, v in enumerate(row_list):
- # if (k + 1) % 2 == 1: # 奇数行==》答案序号行
- # item_no = [int(i) if re.sub(r"[^\d]", "", i) else -1 for i in v]
- # item_no_st = [num for num, i in enumerate(item_no) if i != -1] # 可能开头是-1
- # # print(item_no_st)
- # ans_no.extend([i for i in item_no if i != -1]) # 表格序号
- # table_ans.extend(row_list[k + 1][item_no_st[0]: item_no_st[-1] + 1]) # 表格答案
- #
- # all_item_ans.extend(table_ans)
- # rest_item_str = item_str.split("</table>")[-1].replace("</div>", "").strip()
- # rest_item_split = re.split(r'\n+\s*\([1-9]\)\s*[..、、]?|\n+\s*\([1-4][0-9]\)\s*[..、、]?', rest_item_str) # (\d)形式
- # if len(rest_item_split) > 1:
- # if not rest_item_split[0]:
- # rest_item_split = rest_item_split[1:]
- # all_item_ans.extend(rest_item_split)
- # # print("初步all_item_ans:", all_item_ans) # 初步答案
- # print("table_ans:",table_ans)
- # if len(all_item_ans) == one_type_num:
- # ans_no.extend([int(no) for no in re.findall(r'\n+\s*\(([1-9]|[1-4][0-9])\)\s*[..、、]?', rest_item_str)])
- # return all_item_ans, ans_no
- # else:
- # all_item_ans = []
- # all_item_ans.extend(table_ans)
- # # 再按一行多个答案的情况取答案
- # manyans_oneline_split = re.split(r'\n\s*[1-4][0-9]\s*[..、、]|\n\s*[1-9]\s*[..、、]'
- # r'|(?<![::..、、])\s+[1-4][0-9]\s*[..、、](?!png)|(?<![::..、、])\s+[1-9]\s*[..、、](?!png)',
- # rest_item_str)
- # temp_no = re.findall(r'\n\s*([1-4][0-9]|[1-9])\s*[..、、]|(?<![::..、、])\s+([1-4][0-9]|[1-9])\s*[..、、](?!png)', rest_item_str)
- # temp_no = [int("".join(i)) for i in temp_no]
- # if not temp_no and not all_item_ans: # 没有表格答案的情况,如1~10 ACBBD...
- # row_ans = re.findall("[A-Z](?<!\))", manyans_oneline_split[0].strip())
- # all_item_ans.extend(row_ans)
- # temp_no = re.findall("(\d)-(\d{1,2})", manyans_oneline_split[0])
- # for t in temp_no:
- # ans_no.extend(list(range(int(t[0]), int(t[1])+1)))
- # if row_ans:
- # manyans_oneline_split = []
- # elif temp_no and not manyans_oneline_split[0]:
- # manyans_oneline_split = manyans_oneline_split[1:]
- # ans_no.extend(temp_no)
- # elif re.match("A-Z", manyans_oneline_split[1].strip()) is None and \
- # len(re.findall("[A-Z](?<!\))", manyans_oneline_split[0].strip())) == one_type_num - (
- # len(manyans_oneline_split) - 1): # 没有表格答案的情况,如1~10 ACBBD...
- # row_ans = re.findall("[A-Z](?<!\))", manyans_oneline_split[0].strip())
- # all_item_ans.extend(row_ans)
- # manyans_oneline_split = manyans_oneline_split[1:]
- # if temp_no and temp_no[0] > len(row_ans):
- # ans_no.extend(list(range(temp_no[0]-len(row_ans), temp_no[0])))
- # ans_no.extend(temp_no)
- # else:
- # print("答案序号有问题!!")
- # ans_no.extend(['']*len(row_ans))
- # ans_no.extend(temp_no)
- # # print("manyans_oneline_split:", manyans_oneline_split)
- # all_item_ans.extend(manyans_oneline_split)
- # # print("all_item_ans:", all_item_ans)
- # if len(all_item_ans) == one_type_num:
- # return all_item_ans, ans_no
|