#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from utils.item_type_line import get_item_head_info
from utils.topic_no import judge_item_no_type, get_right_no, pre_get_item_no
def stems_structure_byno(stem_con):
"""
按题号进行切分;
针对无解析的试卷中所有题目的拆分;
:return:{"content": , "item_id": , "errmsgs": [],"item_topic_name":,}
"""
try:
while stem_con and ((re.search(r"[\u4e00-\u9fa5]", stem_con[0]) is None) or
(re.search(r"[一二三四]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", stem_con[0]) is None
and re.search(r"(^|\n)\s*[1-4][0-9]?\s*[..、、]((?!(答题卡|涂黑|本卷|2B铅笔|签字笔|密封线)).)*?$",
stem_con[0]) is None)):
del stem_con[0]
except:
return "本份试卷开头格式有问题,请按试卷格式来!"
stem_str = "\n" + "\n".join(stem_con)
# 题号格式有条件清洗
# def sub1(ss):
# if int(ss.group(5)) - int(ss.group(2)) in [1, 2]:
# return ss.group(1)+"\n"+ss.group(5)+"、"+ss.group(6)
# else:
# return ss.group(0)
# stem_str = re.sub(r"(\n\s*([1-4][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-4][0-9]|[1-9])\s*[..、、]).)+?)\n+\s*([1-4][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", sub1, stem_str, flags=re.S)
# 也可以用下面方法,但比较啰嗦
while re.search(r"(\n\s*([1-4][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-4][0-9]|[1-9])\s*[..、、]).)+?)\n+\s*([1-4][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", stem_str, re.S):
wrong_id_info = re.search(r"(\n\s*([1-4][0-9]|[1-9])\s*[..、、]((?!\n\s*([1-4][0-9]|[1-9])\s*[..、、]).)+?)\n+\s*([1-4][0-9]|[1-9])\s*(?![..、、])([^\s\d]+)", stem_str, re.S)
stem_str = stem_str.replace(wrong_id_info.group(0), wrong_id_info.group(1)+"\n"+wrong_id_info.group(5)+"、"+wrong_id_info.group(6))
# print(stem_str)
# 1、获取题型行信息、按题型行切分
con11, title_info_dict, choice_class = get_item_head_info(stem_str)
all_type = title_info_dict["all_type"]
title_type_num = title_info_dict["title_type_num"]
select_type_id = title_info_dict["select_type_id"]
each_item_score, each_item_score2 = title_info_dict["each_item_score"], title_info_dict["each_item_score2"]
# -------------------------------------------------------
res = []
item_type_classify = {} # 记录每类题型中含有的题目个数,含合并的情况
item_type_num = [] # 题型不合并
pic_no = {} # 记录每个题的图
new_item_no = [] # 将纠错后的题号再记录一份
if not all_type:
print("不存在大题题型行或题型行格式或名称有问题")
# 初步获取题号,题号类型
stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)
# 获取正确题号的位置,进行切分
new_item_no, items_no_idx = get_right_no(item_no_info)
one_item_split = [stem_str[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
dd = {}
for n, one_item in enumerate(one_item_split):
dd["content"] = one_item
dd["item_id"] = new_item_no[n] # 题目本身的题号
dd["item_topic_name"] = "" # 先题型不备注,根据答案再看
dd["errmsgs"] = []
dd["score"] = 0
# if select_type_id and dd["item_id"] in select_type_id:
# dd['is_optional'] = 'true'
res.append(dd)
dd = {}
# 先不做拆图处理了
else:
# print(all_type, len(con11))
if len(all_type) == len(con11)-1: # 第一部分的题型行掉了
all_type.insert(0, "")
each_item_score.insert(0, 0) # 按题型行拿的分数也会掉,先补上默认的0
elif len(all_type) != len(con11):
print("第二种试卷格式:存在题型行没有换行") # 可能造成题目和题型行在同一行
# return "存在题型行末尾没有换行或题型行中题型不明确"
# else:
# # print(all_type)
# if "非选择题" in all_type:
# error_info1 = "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确"
# if any([True for one_type in all_type if re.search("必考基础综合中等面列下各", one_type)]):
# error_info1 = "存在题型行中题型不明确"
# ---------------------------------------------------------------------
# 思路:>>>>先纠正题号,再拆分题目;等切分好后再纠正(删减添加操作)比较费时
# 按题号切分,可再加些细节!!!! 1>>题号不要求连续
# 初步判断题号类型
stem_str, item_no_info, item_no_type = judge_item_no_type(stem_str)
# >>>>切分题目
for num, one_type in enumerate(con11):
# 初步获取题号
item_no_info = pre_get_item_no("\n"+one_type, item_no_type)
# 获取正确题号的位置,进行切分
if res:
items_no_temp, items_no_idx = get_right_no(item_no_info, have_type=1, last_id=res[-1]["item_id"])
else:
items_no_temp, items_no_idx = get_right_no(item_no_info)
is_from_0 = 1
if items_no_temp and items_no_idx[0] != 0 and items_no_temp[0] > 1: # 以防出现题号漏了的情况
items_no_idx.insert(0, 0)
is_from_0 = 0
one_item_split = [("\n" + one_type)[i:j] for i, j in zip(items_no_idx, items_no_idx[1:] + [None])]
# ------------------每个大题的第一题前面的图可能有漏的情况------------------------
may_oimt_pic = []
if not is_from_0:
may_oimt_pic_info = re.search("\n((\s*)*?)\s*\n?$", one_item_split[0])
if may_oimt_pic_info:
if len(one_item_split) > 1 and re.search("如[上下左右]?图", one_item_split[1]):
may_oimt_pic.extend(re.findall("", may_oimt_pic_info.group(1)))
# print("may_oimt_pic:", may_oimt_pic)
# ------------------------------------------------------------------
if len(all_type) == len(con11):
if not is_from_0:
if all_type[num] and all_type[num].replace("题", "") not in ['选择', '单选', '多选']:
one_item_split = one_item_split[1:]
else: # 针对选择题第1题或前面几题题号漏了的情况
one_item_split1 = one_item_split[1:]
# ------针对分错的细节继续拆分,如上一题选项行与下一题题干没有换行-----------------
new_one_item_split = []
pattern_1 = re.compile(
r"([CDE]\s*[..、、].+?)(? 0.0:
dd["score"] = title_info_dict["total_score"][num]
res.append(dd)
# 多图在一起的情况进行拆分
if len(re.findall(r"第?\(?([1-9]|[1-4][0-9])\)?\s*题图", one_item)) > 1 and \
re.search(r"", pic_info.group(1))
pic_w = re.findall("([1-9]|[1-4][0-9])[))]?\s*(?=题)", pic_info.group(2))
if len(pic_list) >= len(pic_w):
pic_no = {int(p): pic_list[len(pic_list) - len(pic_w) + k] for k, p in enumerate(pic_w)}
dd["content"] = dd["content"].replace(pic_info.group(2), "")
for k, pic in enumerate(pic_list[::-1]):
if k < len(pic_w):
dd["content"] = dd["content"].replace(pic, "")
# -----------------------------------------------------
dd = {}
if pic_no:
for i in list(pic_no.keys()):
res[i-1]["content"] = res[i-1]["content"].strip() + "\n" + pic_no[i] + "\n" + "第" + str(i) + "题图"
# ----------------------------------------------------------------------
# --------最后判断一下题量是否正确-----------------------------------------
# 针对拆分后题量特别多的情况,将拆分后题量与已知题量一样的题目保留
new_res = []
right_type = [] # 记录与已知题目个数一样的分块{第几个分块,题型}
if all_type and sum(list(item_type_classify.values())) > 30: # 很可能存在大片不是题号的题号
title_type_num_all = sum([t[1] for t in title_type_num]) # 题型行给出的题目个数
if title_type_num_all > 0:
for idx, type_num in enumerate(item_type_num):
if type_num[1] == title_type_num[idx][1] > 0: # 同一题型是否题量一致
right_type.append((idx, type_num[0]))
elif title_type_num[idx][1] == 0 and type_num[1] == 1:
title_type_num[idx][1] = 1
right_type.append((idx, type_num[0]))
if right_type:
for rtype in right_type:
for idx, item in enumerate(res):
r_st = 0 if rtype[0]==0 else sum([t[1] for t in item_type_num[:rtype[0]]])
r_ed = r_st + item_type_num[rtype[0]][1]
if item["item_topic_name"] == rtype[1] and r_st<=idx