#!/usr/bin/env python
# -*- coding:utf-8 -*-
# paper3_process: 第三类word试卷模式, 题目和答案分开的情况
# split2one_item:将所有行文本 按题型分大类,再在每个大类中切分每个题目
# split2one_item_by_topicno:将所有行文本 按题型分大类,再在每个大类中按题号切分每个题目
"""
总共3种方案:1、教师用卷;2、按题号切分;3、划分试题和答案,再按题号切分
"""
from structure.ans_structure import *
from utils.insert_keywords import get_con
from utils.item_resplit import resplit
from utils.washutil import table_label_cleal
from structure.stems_structure import stems_structure_byno
from utils.item_type_line import get_item_head_info
from utils.topic_no import judge_item_no_type, get_right_no
from utils.stem_ans_split import stem_ans_split
from collections import Counter
from pprint import pprint
def items_ans_reform(items_list, ans_list):
    """
    Attach a separately supplied answer section to the questions.

    Handles the third Word paper layout, in which questions and answers are
    delivered as two separate parts. The answer part itself comes in several
    layouts (for example with or without question-type headings).

    NOTE(review): this view of the file carries no indentation, so the nesting
    below is reconstructed from statement order -- confirm against the
    repository version.
    NOTE(review): ``re``, ``del_no``, ``anss_structure_with_type`` and
    ``ans_structure_step1`` are not imported by name in the visible import
    block; presumably they arrive through
    ``from structure.ans_structure import *`` -- verify.
    NOTE(review): several character classes below list the same character
    twice (e.g. ``[((]``, ``[、..、]``); presumably one of each pair was a
    full-width variant that got folded by a normalisation step -- verify.

    :param items_list: lines of the question part
    :param ans_list: lines of the answer part
    :return: one of
        * the non-str value returned by ``split_by_keywords`` (a
          ``(res, item_no_type)`` 2-tuple), when the answer part carries more
          than two matching 【答案】/【解析】 pairs;
        * the str returned by ``stems_structure_byno``;
        * otherwise ``(item_res, item_no_type, rd2_is_fail)``.
        NOTE(review): callers must cope with both the 2-tuple and the 3-tuple.
    """
    con1 = list(filter(lambda x: x.strip() != "", items_list)) # question lines
    anss1 = list(filter(lambda x: x.strip() != "", ans_list)) # answer lines; each list element is one line
    # Drop a trailing / leading title line ("...省...试卷", answer-sheet heading).
    # con1[-1] and anss1[0] raise IndexError when the filtered list is empty.
    if re.match(".+?省.+?试[卷题]", con1[-1]):
        con1 = con1[:-1]
    if re.match(".+?省.+?试[卷题]|.*?答题?[卷卡页]", anss1[0]):
        anss1 = anss1[1:]
    print("-------答案页----------") # the answer page may consist entirely of images
    # pprint(anss1)
    print("-------答案页--end--------")
    #-------------- the answer page also contains the questions ----------------------------
    ans_n = re.findall("【答案】", "\n".join(anss1))
    # Chained comparison: the 【答案】 count equals the 【解析】 count AND that count is > 2.
    if ans_n and len(ans_n) == len(re.findall("【解析】", "\n".join(anss1)))>2: # same number of answer and explanation markers
        item_res = split_by_keywords(anss1)
        if type(item_res) != str:
            return item_res
    # ----------------- [parse the questions] ----------------------------
    print('---------------解析 题目-------------------')
    ress = stems_structure_byno(con1)
    if type(ress) == str:
        return ress
    else:
        item_res, all_type, item_type_classify, item_no_type, item_type_num, new_item_no = ress # structured questions (without explanations)
    # Drop questions whose content is empty; strip the number from the rest.
    new_res = []
    for k, sub_res in enumerate(item_res):
        if sub_res['content'].strip():
            sub_res['content'] = del_no(sub_res['content'])
            new_res.append(sub_res)
    item_res = new_res
    # pprint(item_res)
    # Correct the question split first!
    item_res = resplit(item_res)
    print("item_type_classify:", item_type_classify)
    print("item_type_num:", item_type_num)
    print('----------解析 答案---------------')
    # ------------- parse the answers ---------------------------
    # Cases: 1>> answers are grouped by question type, e.g. "一、选择题 1.answer 2.answer"
    # 2>> answers carry no question-type keyword and are ordered by number only
    # 3>> neither answers nor questions carry question-type keywords; all_type and item_type_classify are empty
    # print(anss1)
    new_ans_no1 = []  # never read in this function
    rd1_is_fail = 0
    have_type_line = re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,5}题", "\n".join(anss1))
    if have_type_line:
        # The cleaning of anss1 here should not affect the original text used by the
        # second pass (rd2_is_fail)!! Left unchanged for now.
        # NOTE(review): the raw-string literal opened on the next line runs across a line
        # break and looks truncated (markup-like fragments seem to be missing); as written it
        # is not a valid single-line literal -- restore it from version control.
        # The loop deletes leading answer lines; anss1[0] raises IndexError if the list empties.
        while re.search(r"
[A-F] | |[A-F] | ", anss1[0]) is None and (re.search(r"[\u4e00-\u9fa5]", anss1[0]) is None
                or re.search(r"[一二三四五六七八九十]\s*[、..、]\s*()?\s*.{2,5}题", anss1[0]) is None):
            del anss1[0]
        # Question types named in the answer part
        all_type2 = re.findall(r"\n\s*[一二三四五六七八九十]\s*[、..、]\s*([^必考基础综合中等]{2,5}题)", "\n" + "\n".join(anss1))
        # '本大题' is handled later
        print("答案中的题型:", all_type2)
        ans_str = "\n" + "\n".join(anss1)
        # try:
        item_res, rd1_is_fail = anss_structure_with_type(item_res, ans_str, all_type, all_type2, item_type_num, item_type_classify)
        # except:
        # rd1_is_fail = 1
    # No type line, or the first pass failed
    rd2_is_fail = 0
    if not have_type_line or rd1_is_fail: # no type line in the answers, or the type-line names are irregular
        print('没有题型行或题目和答案的题型个数不一致或第一次解析失败')
        # Blank out question-type heading text (Chinese numeral + "...题" + optional score
        # note) in every answer line before the number-only pass.
        anss1 = list(
            map(lambda x: re.sub(r"(\n|^)\s*[一二三四五六七八九十]\s*[、..、]?\s*( )?"
                                 r"(\s*.{2,5}题.+?分\s*[.。]?\s*$|.*?[((].+?[得共]\d+分.*?[))].*?$"
                                 r"|\s*.{2,5}题\s*([((].+?[))])?).*?$|(\n|^)\s*.{2,5}题(.+?分\s*[))])?\s*$", "", x), anss1))
        # print("anss1:", anss1)
        raw_item_res = item_res  # an alias of the same list, not a copy
        # try:
        item_res = ans_structure_step1(anss1, item_type_classify, item_res) # structure the answers as a whole
        # The flag is raised when the string form of the result differs from the input.
        # NOTE(review): if ans_structure_step1 mutates item_res in place and returns it, both
        # names refer to the same list and this can never fire; a changed result also looks
        # like success rather than failure -- confirm the intended polarity.
        if str(raw_item_res) != str(item_res):
            rd2_is_fail = 1
        # except:
        # rd2_is_fail = 1
        # for i, one_item in enumerate(item_res):
        # item_res[i].update({'answer': "", 'parse': ""})
        # return item_res, item_no_type, rd2_is_fail
    # Guarantee that every question dict carries 'answer' and 'parse' keys.
    for i, one_item in enumerate(item_res):
        if 'answer' not in one_item:
            item_res[i]['answer'] = ""
        if 'parse' not in one_item:
            item_res[i]['parse'] = ""
    return item_res, item_no_type, rd2_is_fail
def split_by_keywords(con_list):
    """
    Split a teacher's-edition paper that carries 【答案】/【解析】 keywords
    (first paper format).

    Approach: split by question-type heading first, then cut on the
    【答案】/【解析】 markers, then separate stem, answer and explanation for
    each question.

    NOTE(review): this view of the file carries no indentation, so the nesting
    below is reconstructed from statement order -- confirm against the
    repository version.
    NOTE(review): several character classes below list the same character
    twice (e.g. ``[::]``, ``[..、、]``); presumably one of each pair was a
    full-width variant that got folded by a normalisation step -- verify.

    :param con_list: list of text lines
    :return: an error-message str when the layout cannot be handled;
        otherwise ``(res, item_no_type)`` where ``res`` is a list of
        per-question dicts. Each dict gets ``item_topic_name``, ``score``,
        ``errmsgs`` and ``item_id`` (plus ``is_optional`` for optional
        questions); ``content`` / ``answer`` / ``parse`` come from zipping the
        key names with the split pieces, so a key is absent when the split
        yields fewer than three pieces.
    """
    # items_con = "\n" + "\n".join(con_list)
    # judge_item_no_type(items_con)
    # item_no_type = 1
    # all_con = table_label_cleal()
    # item_no = [int(no) for no in re.findall(r'\n+\s*([1-9][0-9]?)\s*[..、、]', all_con)]
    # if len(item_no) <= 2:
    # item_no_type = 2
    # item_no = [int(no) for no in re.findall(r'\n+\s*[((]\s*([1-9][0-9]?)\s*[))]\s*[..、、]?', all_con)]
    # if len(item_no) > 3:
    # Mark redundant blank lines next to keyword / heading lines for deletion; has little effect.
    # NOTE(review): the first re.match pattern below is visibly truncated -- the text between
    # its last "|" and " 0 and v.strip()" is missing (it presumably ended the pattern, tested
    # con_list[k + 1] and opened a "k > 0" alternative). The statement does not parse as
    # written -- restore it from version control.
    con2 = ["【delete】" if (k < len(con_list) - 1 and v.strip() == "" and (
        re.match(r"【(答案|解析)】|(答案|解析)\s*[::]| 0 and v.strip() == "" and (
        re.match(r"【(答案|解析)】$|(答案|解析)\s*[::]", con_list[k - 1].strip()) or
        re.match(r"[a-z<>/\s]*[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题",
                 con_list[k - 1].strip())))
            else v for k, v in enumerate(con_list)]
    con3 = list(filter(lambda x: x != "【delete】", con2))
    # Trim blank lines at both ends.
    while con3 and con3[-1].strip() == "":
        del con3[-1]
    while con3 and con3[0].strip() == "":
        del con3[0]
    con3.append("") # otherwise the last question would be dropped
    # Handle useless leading information: put a type heading found inside the first line on
    # its own line, then drop leading lines until the first one contains a type heading.
    con3[0] = re.sub(r"([一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题)", r"\n\1", con3[0])
    while con3 and (re.search(r"[\u4e00-\u9fa5]", con3[0]) is None
                    or re.search(r"[一二三四五六七八九十]\s*[、..、]\s*[^必考基础综合中等]{2,4}题", con3[0]) is None):
        del con3[0]
    # ---------------------------------- start structuring ---------------------------------------------
    items_con = "\n" + "\n".join(con3)
    # Preliminary extraction of question numbers and numbering type
    items_con, item_no_info, item_no_type = judge_item_no_type(items_con)
    # 1. Get the type-heading information and split by type heading
    con4, title_info_dict, choice_class = get_item_head_info(items_con)
    all_type = title_info_dict["all_type"]
    select_type_id = title_info_dict["select_type_id"]
    each_item_score, each_item_score2 = title_info_dict["each_item_score"], title_info_dict["each_item_score2"]
    # 2. Two paths, depending on whether type headings exist
    # No image splitting is done here
    res = []
    if not all_type:
        print("不存在大题题型行或题型行格式有问题")
        # Without type headings the text is only usable when 【答案】 and 【解析】 pair up.
        if len(re.findall(r"\n\s*【答案】", items_con)) != len(re.findall(r"\n\s*【解析】", items_con)):
            return "不存在大题题型行或题型行格式有问题"
        else:
            item_no = []
            # The capturing group keeps the markers in the result, so one question
            # (one 【答案】 + one 【解析】) yields exactly 5 pieces.
            subcon = re.split(r"((?<=\n)\s*【答案】|(?<=\n)\s*【解析】)\n?", items_con.strip())
            pattern1 = re.compile(r"([1-9]|[1-4][0-9])\s*[..、、].+?")
            # NOTE(review): without indentation it is not visible whether the "else" below
            # pairs with "if int(st_id) > 1" (as laid out here) or with the outer
            # "if re.match(...)" -- confirm. As laid out, nothing is appended when the first
            # piece does not start with a number, and item_no[0] below then raises IndexError.
            if re.match(pattern1, subcon[0].strip()):
                st_id = re.match(pattern1, subcon[0].strip()).group(1)
                if int(st_id) > 1:
                    item_no.append(int(st_id))
                else:
                    item_no.append(1)
            if len(subcon) == 5: # only one question
                dd = dict(zip(["content", "answer", "parse"],
                              re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】", table_label_cleal(items_con))))
                dd["item_topic_name"] = ""
                # Remove a leading "<number><delimiter>" found within the first 5 characters.
                dd["content"] = re.sub(r"^\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
                dd["score"] = 0
                dd["errmsgs"] = []
                dd["item_id"] = item_no[0] # use the actual id, not the positional index
                res.append(dd)
            else:
                # ------ locate the next question's content between this question's 【答案】 and the next question's 【解析】 --------
                all_item, local_item_no, errmsg_dict, count = get_con(subcon, item_no_type, index=0)
                item_no.extend(local_item_no)
                for idk, one_item in enumerate(all_item):
                    dd = dict(zip(["content", "answer", "parse"],
                                  re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
                                           table_label_cleal(one_item))))
                    dd["item_topic_name"] = ""
                    # NOTE(review): unlike the single-question branch this pattern is not
                    # anchored with "^", so every "<number><delimiter>" inside the first
                    # 5 characters is removed -- confirm that is intended.
                    dd["content"] = re.sub(r"\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
                    dd["score"] = 0
                    dd["errmsgs"] = [errmsg_dict[idk]] if idk in errmsg_dict else []
                    dd["item_id"] = item_no[idk]
                    res.append(dd)
    else:
        if len(all_type) != len(con4):
            print("存在题型行没有换行")
            return "存在题型行末尾没有换行,请在所有题型行末尾重新换行" # to be handled by scheme [2]
        else:
            # if "非选择题" in all_type:
            # return "第" + str(all_type.index("非选择题")+1) + "大题的题型不明确"
            index = 0  # running offset, advanced by `count` after each section
            for num, one_type in enumerate(con4):
                count = 1  # overwritten by get_con() in the multi-question branch
                if len(re.findall(r"\n\s*【答案】", one_type)) == len(re.findall(r"\n\s*【解析】", one_type)):
                    subcon = re.split(r"((?<=\n)\s*【答案】|(?<=\n)\s*【解析】)\n?", one_type.strip())
                    # Correct index according to the number of the first question
                    item_no = []
                    pattern1 = re.compile(r"([1-9]|[1-4][0-9])\s*[..、、].+?")
                    # NOTE(review): nesting reconstructed -- the append of st_id is assumed to
                    # run for every numbered first piece and the "else" to pair with the outer
                    # "if re.match(...)"; confirm.
                    if re.match(pattern1, subcon[0].strip()):
                        st_id = re.match(pattern1, subcon[0].strip()).group(1)
                        if num == 0 and int(st_id) != 1:
                            index = int(st_id) - 1
                        item_no.append(int(st_id))
                    else:
                        item_no.append(index+1)
                    if len(subcon) == 5: # only one question
                        dd = dict(zip(["content", "answer", "parse"],
                                      re.split(r"(?<=\n)\s*【答案】|(?<=\n)\s*【解析】", table_label_cleal(one_type))))
                        dd["item_topic_name"] = all_type[num]
                        dd["content"] = re.sub(r"^\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
                        dd["score"] = each_item_score[num]
                        dd["errmsgs"] = []
                        dd["item_id"] = item_no[0] # use the actual id, not the positional index
                        # Fall back to the per-question score table, keyed by str(item_id).
                        if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
                            dd["score"] = each_item_score2[str(dd["item_id"])]
                        if select_type_id and dd["item_id"] in select_type_id:
                            dd['is_optional'] = 'true'
                        # A single-question section with no score takes the section total.
                        if dd["score"] == 0.0 and title_info_dict["total_score"][num] > 0.0:
                            dd["score"] = title_info_dict["total_score"][num]
                        res.append(dd)
                    else:
                        # ------ locate the next question's content between this question's 【答案】 and the next question's 【解析】, then split on these three keywords --------
                        all_item, local_item_no, errmsg_dict, count = get_con(subcon, item_no_type, all_type=all_type, num=num, index=index)
                        item_no.extend(local_item_no)
                        for idk, one_item in enumerate(all_item):
                            dd = dict(zip(["content", "answer", "parse"],
                                          re.split(r"(?<=\n)\s*【答案】\n?|(?<=\n)\s*【解析】\n?",
                                                   table_label_cleal(one_item))))
                            dd["item_topic_name"] = all_type[num]
                            dd["content"] = re.sub(r"\d+\s*[..、、]", "", dd["content"][:5]) + dd["content"][5:]
                            dd["score"] = each_item_score[num]
                            dd["errmsgs"] = [errmsg_dict[idk]] if idk in errmsg_dict else []
                            dd["item_id"] = item_no[idk] # idk+1+index would be the ordinal
                            # Override the type name for ids listed in choice_class
                            # (key + "选题").
                            if choice_class:
                                for k, v in choice_class.items():
                                    if dd["item_id"] in v:
                                        dd["item_topic_name"] = k + "选题"
                            # elif len(choice_class) == 1:
                            # dd["item_topic_name"] = "多选题" if k == "单" else "单选题"
                            if not dd["score"] and each_item_score2 and str(dd["item_id"]) in each_item_score2.keys():
                                dd["score"] = each_item_score2[str(dd["item_id"])]
                            if select_type_id and dd["item_id"] in select_type_id:
                                dd['is_optional'] = 'true'
                            res.append(dd)
                    # pprint(res)
                else:
                    # Unequal 【答案】/【解析】 counts inside this section: report which section.
                    return "第" + str(num + 1) + "大题《" + all_type[num] + "》中【答案】或【解析】格式有误或其中某道题中出现多个相同关键字或漏关键字"
                index += count
    return res, item_no_type
def _cut_numbered_answers(ans_str, ans_no_idx):
    """Cut ``ans_str`` at the offsets in ``ans_no_idx`` and run ``del_no`` on each piece."""
    return [del_no(ans_str[i:j]) for i, j in zip(ans_no_idx, ans_no_idx[1:] + [None])]


def split_by_topicno(con_list):
    """
    Split a paper into questions by question number (second paper format).

    Used when the text does not carry both 【答案】 and 【解析】 keywords for
    every question. The lines are structured by ``stems_structure_byno``;
    each question that contains an answer/explanation keyword is split further
    by ``stem_ans_split``; answers that trail the last question are cut out
    and matched back to their questions with ``get_ans_match``.

    NOTE(review): the reviewed copy of this function had no indentation, so the
    nesting here is reconstructed from statement order. Three placements are
    assumptions to confirm against the repository version: ``del_no`` runs for
    every question (not only in the no-keyword branch); ``get_ans_match`` in
    the table branch runs only when the table answers are accepted; the
    duplicate-id check runs regardless of ``res`` being empty.
    NOTE(review): the reviewed copy listed the same character twice in several
    character classes (``[::]``, ``[((]``, ``[..、、]``, ``[??]``). The
    duplicates are written here as the half-width plus full-width pair they
    presumably stood for; this only widens what matches -- verify.

    :param con_list: list of all text lines
    :return: the str returned by ``stems_structure_byno`` when it reports a
        problem; otherwise ``(res, item_no_type)`` with ``res`` a list of
        per-question dicts
    """
    con1 = [line for line in con_list if line.strip() != ""]
    ress = stems_structure_byno(con1)  # preliminary structure, split by question number
    if isinstance(ress, str):
        return ress
    res, _, _, item_no_type, item_type_num, _ = ress
    print("item_type_num:", item_type_num)

    # Some questions may carry an explanation while others do not.
    for one_res in res:
        if re.search('\n【(答案|[解分][析答]|详解|点[评睛]|考点|专题)】', one_res["content"]):
            # "case1": an answer keyword is present; "case0": only other keywords.
            has_answer_kw = re.search(r'\n【答案】|答案\s*[::]', one_res["content"]) is not None
            case = "case1" if has_answer_kw else "case0"
            dd1 = stem_ans_split(one_res, case)  # split this question further
            one_res["content"] = dd1["content"]
            del dd1["content"]
            one_res.update(dd1)
        else:  # no explanation
            one_res.update({"answer": "", "parse": ""})
        one_res["content"] = del_no(one_res["content"], item_no_type)
        if 'pic' in one_res:
            one_res["content"] += "\n" + "\n".join(one_res["pic"])
            del one_res["pic"]

    # Correct the question split first.
    res = resplit(res)

    # The last question may be followed by a few answers (no separate answer page).
    if res:
        tail_match = re.search(
            r'\n\s*([1-9]|[1-4][0-9])\s*[..、、]\s*(解\s*[::]|【解析|【答案)',
            res[-1]["content"])
        if tail_match:
            # Everything from the first "<n>. 解:/【解析/【答案" onwards is answer text.
            breakp = tail_match.start()
            ans_str = res[-1]["content"][breakp:]
            ans_no_info = pre_get_item_no(ans_str, item_no_type)
            ans_no, ans_no_idx = get_right_no(ans_no_info)
            all_ans = _cut_numbered_answers(ans_str, ans_no_idx)
            res[-1]["content"] = res[-1]["content"][:breakp]
            res = get_ans_match(res, all_ans, ans_no)
        else:
            ans_str = res[-1]["content"] + res[-1]["parse"]
            ans_no_info = pre_get_item_no(ans_str, item_no_type)
            ans_no, ans_no_idx = get_right_no(ans_no_info)
            if len(ans_no) == len(res):
                # One numbered answer per question: treat the numbered tail as the answers.
                all_ans = _cut_numbered_answers(ans_str, ans_no_idx)
                res[-1]["content"] = res[-1]["content"][:ans_no_idx[0]]
                res = get_ans_match(res, all_ans, ans_no)
            elif ans_no_idx:
                try:
                    # Answers given in a table in front of the numbered answers.
                    ans_no1, table_ans, st = get_table_ans(res[-1]["content"][:ans_no_idx[0]], flag=1)
                    # Accept the table only if the numbered answers continue right after it.
                    if table_ans and 0 < ans_no[0] - ans_no1[-1] < 3:
                        # Work on copies so the helper's return values are not mutated.
                        all_ans = list(table_ans)
                        all_ans.extend(_cut_numbered_answers(ans_str, ans_no_idx))
                        new_ans_no = list(ans_no1)
                        new_ans_no.extend(ans_no)
                        if st >= 0:
                            res[-1]["content"] = res[-1]["content"][:st]
                        else:
                            res[-1]["content"] = res[-1]["content"][:ans_no_idx[0]]
                        res = get_ans_match(res, all_ans, new_ans_no)
                except Exception:
                    # Best-effort fallback when the table parsing fails; narrowed from a bare
                    # ``except`` so KeyboardInterrupt / SystemExit still propagate.
                    if len(ans_no) > 4:
                        all_ans = _cut_numbered_answers(ans_str, ans_no_idx)
                        res[-1]["content"] = res[-1]["content"][:ans_no_idx[0]]
                        res = get_ans_match(res, all_ans, ans_no)

    # When no answer split point was recognised, part of the answer section was probably split
    # as if it were questions, so look for repeated ids.
    all_no = [one_res['item_id'] for one_res in res]
    if len(all_no) - len(set(all_no)) > 2:
        dup_no, dup_count = Counter(all_no).most_common(1)[0]
        if dup_count > 1:
            # The second occurrence of the most frequent id marks where the answers start.
            split_idx = [i for i, no in enumerate(all_no) if no == dup_no][1]
            for one_res in res[split_idx:]:
                tail_text = one_res["content"] + "\n" + one_res["parse"]
                # Skip entries that still look like questions (blank brackets, "...多少", "求...?").
                if re.search(r"[((]\s+[))]|(等于|存在|[是有为])多少|求.*?[??]", tail_text) is None:
                    bef_no = [k for k, j in enumerate(res[:split_idx])
                              if j["item_id"] == one_res["item_id"]]
                    # Only fill in an explanation that is still empty.
                    if bef_no and not res[bef_no[0]]["parse"]:
                        res[bef_no[0]]["parse"] = tail_text
            return res[:split_idx], item_no_type
    return res, item_no_type
|