#!/usr/bin/env/python # -*- coding:utf-8 -*- import re from utils.washutil import table_label_cleal import numpy as np def option2block(option_con, item_no_type): """ Split the options of a multiple-choice question. Ideally the options would also be self-split and error-corrected first, like the item numbers, but then a letter mistyped by the teacher could make parsing fail. :return: """ def del_table(ss): ss = re.sub(r"?t[dr]>|?tbody>|?table>|?div>|?p>", "", ss.replace("
", " ")) return ss # print('***********',option_con) if '
(A\s*[..、、].+?|\(A\)\s*[..、、]?.+?) |
(A\s*[..、、].+?|\(A\)\s*[..、、]?.+?) |
\s*(A\s*[、、..]|\(A\)\s*[、、..]?)(.+?)", r"\n【【\1】】\2", option_con, re.S)
# NOTE(review): everything above this comment is not valid Python as it stands: the shebang,
# the imports, the `def option2block` header and the nested `del_table` helper are fused onto
# one line and the regex literals look truncated (markup apparently stripped) - restore from VCS.
# If no line starts with an upper-case "C" but one starts with lower-case "c", upper-case it.
if re.search("\n\s*C", option_con) is None and re.search("\n\s*c", option_con):
option_con = re.sub("\n\s*c", "\nC", option_con)
# NOTE(review): the `(\s*)+?` group below looks like a remnant of a stripped pattern - confirm.
option_con = re.sub(r"(\n\s*(\s*)+?\s*)(A[、、..].+?)", r"\1\n\3", option_con.strip())
# Wrap every option label that follows a newline ("A" to "H" plus a separator) as "【【X、】】".
con = re.sub(r"\n\s*([A-H])\s*[、、..](.+?)", r"\n【【\1、】】\2", option_con.strip()) # a leading "A、" at the very start of the text must not be matched, hence the strip()
# Switch to type 2 when at most two "A." style labels were wrapped but more than two
# "(A)" style labels are present in the text.
if item_no_type == 1 and len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 2 and \
len(re.findall(r'\([A-H]\)', con)) > 2: # handles the case where the stem uses the first format but the options use the second
item_no_type = 2
# Type 2: labels are written "(A)"; wrap them the same way, starting again from option_con.
if item_no_type == 2:
con = re.sub(r"\n\s*\(([A-Hc])\)\s*[、、..]?(.+?)", r"\n【【\1、】】\2", option_con)
if item_no_type == 1:
if len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 3:
# NOTE(review): the next line is truncated; the code that builds con_list is missing.
while re.search(r"\n\s*[A-H]\s*\s+)(?\s+)(? 1:
# The first piece goes through table_label_cleal (imported helper, behaviour not visible here);
# the remaining pieces go through del_table.
stem_opt = table_label_cleal(con_list[0])
con_list = list(map(del_table, con_list[1:]))
con_list.insert(0, stem_opt) # tables inside the stem do not need to be cleaned
return con_list, con
recur_n = 1 # recursion counter (read and updated by option_structure through `global`)
def option_structure(one_item, con, ans, item_no_type):
"""
Split a multiple-choice item into its stem and options and record the result on one_item.
TODO (original author): still need to check that the number of options matches the question type.
Appends a message to one_item["errmsgs"] when the options cannot be parsed.
NOTE(review): several lines of this function are truncated in this copy of the file.
:return: one_item
"""
global recur_n
# print(con)
# print('----------------------')
# Stop retrying once the module-level counter exceeds 2; a short format error is recorded
# when no 'options' key has been set on one_item.
if recur_n>2:
if 'options' not in one_item:
one_item["errmsgs"].append("选项格式不正确")
recur_n = 1
return one_item
# Remove the characters listed in the class (semicolons and ".") from the answer string.
ans = re.sub("[;;.]+", "", ans)
ans2 = []
# The answer string is split on "#".
for a in ans.split("#"):
# NOTE(review): the next two lines are truncated; the loop body and the code that builds
# con_list and updates one_item are missing.
if 0
|\n)\s*$|\s+$", "", i) for i in con_list[1:]],
})
# return dict(one_item, **dict(zip(["content","A","B","C","D"], con_list)))
else:
# the options may be inside a table
is_fail = 0
con_list2 = re.split(r"\n+", con)
errmsgs = ""
if len(con_list2) == 2: # case where the options consist of 4 images
# NOTE(review): the next line is truncated (the regex and the following `if` are fused).
option_array = len(re.findall("(^|\n) 2: # 排列情况
options_rank = 1
elif option_array > 1:
options_rank = 3
else:
options_rank = 2
# NOTE(review): the next line is truncated.
con_list2 = [con_list2[0] if k == 0 else "|\n)\s*$|\s+$", "", i) for i in con_list2[1:]],
})
else:
errmsgs = """选项格式不正确,请改为: A.xxxx B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输;3>>选项图片时用嵌入式;
4>>选项太长时,每项之间要换行,上一项的内容不要与下一项在同一行!!"""
is_fail = 1
else:
# NOTE(review): the next line is truncated.
con_list3 = re.split(r"\n|\n)\s*$|\s+$", "", i) for i in con_list3[1:]],
})
else:
errmsgs = """选项格式不正确,请改为: A.xxxx B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输;3>>选项图片时用嵌入式;
4>>选项太长时,每项之间要換行,上一项的内容不要与下一项在同一行!!"""
is_fail = 1
# Split con at bracket pairs that contain only whitespace: the text after the last pair is
# treated as the option text, everything before it is re-joined as the stem and terminated
# with "( )\n".
op_con = re.split("[((]\s*[))]", con)[-1]
stem_con = "".join(re.split("[((]\s*[))]", con)[:-1])+"( )\n"
if is_fail:
if "table" in op_con:
# NOTE(review): the findall pattern below is truncated and spans two lines.
to_clean_con = re.findall('(((?!(?table>)).)*)
', op_con, re.S)
if len(to_clean_con) == 1:
# Remove table markup from the option text and parse again.
# NOTE(review): this pattern also looks truncated.
op_con = re.sub("?table>|?tr>|?td>", "", op_con)
one_item = option_structure(one_item, stem_con+op_con, ans, item_no_type)
else:
# Retry when the option text holds exactly four letters from A-E, all distinct:
# add "、" after each letter that is not already followed by a separator.
aa = re.findall("[A-E]", op_con)
if len(aa)==len(set(aa)) == 4:
recur_n += 1
op_con = re.sub("([A-E])\s*(?![..、、])", r"\1、", op_con)
one_item = option_structure(one_item, stem_con + op_con, ans, item_no_type)
# Record the detailed message only when nothing was parsed and no format error is present yet.
if 'options' not in one_item and "选项格式不正确" not in "".join(one_item["errmsgs"]):
one_item["errmsgs"].append(errmsgs)
return one_item
def get_options_arrange(cont):
"""
Determine how many options are laid out per line in the Word document.
NOTE(review): in this copy of the file the end of this function and the header of the
following function are missing (see the note further down).
:return:
"""
options_rank = 1 # vertical layout
option_num = 0
# NOTE(review): the literal below is empty, so the test is always true; together with the
# pattern on the next line it looks like markup was stripped from both - confirm.
if '' in cont:
table_op = re.findall(' .+?>([A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?) ', cont.strip())
if table_op:
# Count the option labels found in the first match.
option_num = len(re.findall('[A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?', table_op[0]))
if option_num == 2:
options_rank = 3
if option_num > 2:
options_rank = 2
else:
option_list = cont.split("\n")
for op in option_list:
# NOTE(review): the next line is truncated. The rest of get_options_arrange and the `def`
# line of a second function are missing; the code from here on uses names (con_list,
# con_list2, opt_letter, p1) that are not defined anywhere in the visible text.
if re.search("^\s*[A-H]\s*[..、、].+?|^\s*[A-H]\s*= 4:
# NOTE(review): list.extend() returns None, so new_con_list is None here and the
# following return yields None rather than a list.
new_con_list = ["".join(con_list[:p1])].extend(option_list)
return new_con_list
else: # only the ABCD and ACBD orderings are considered
label_str = "".join(opt_letter)
# If the label sequence does not start with "A", replace the character before the first
# later "A" with "0".
if re.match("A","".join(opt_letter)) is None:
label_str = re.sub("[^A]A", "0A", "".join(opt_letter), count=1)
# print(label_str)
# -------------------------------------------------------------
# If the question has no "()" and "AA" still shows up in the stem, check whether that is an error
if re.search("AA", label_str):
label_bcd_idx = [k for k, i in enumerate(label_str) if i != 'A']
label_a_idx = [k for k, i in enumerate(label_str) if i == 'A']
length_all = []
for i1 in label_bcd_idx: # strip formulas first, then use the length of each option for the check
# NOTE(review): the pattern on the next line is truncated.
l1 = len(re.sub(r").)+?/>", "", con_list2[i1+1]).replace(" ",""))
length_all.append(l1)
# Mean length of the pieces labelled with a letter other than "A".
aver_length = np.mean(length_all)
st_a = label_str.index("AA")
for i2 in label_a_idx:
# NOTE(review): the pattern on the next line is truncated.
l2 = len(re.sub(r").)+?/>", "", con_list2[i2+1]).replace(" ",""))
# An "A" piece whose length differs from that mean by 12 or more moves the start index past it.
if abs(l2 - aver_length) >= 12:
if i2 >= st_a:
st_a = i2+1
if st_a < len(label_str)-3:
# Blank out every label before st_a with "0".
label_str = "".join(["0" if k < st_a else i for k, i in enumerate(label_str)])
# -----------------------------------------------------------------
# A label that follows A/B/C/D but is not one of the letters accepted after it is
# overwritten with the preceding letter (e.g. "A" + unexpected -> "AA").
label_str = re.sub("A[^BC]", "AA", label_str)
label_str = re.sub("B[^CD]", "BB", label_str)
label_str = re.sub("C[^BD]", "CC", label_str)
label_str = re.sub("D[^E]", "DD", label_str)
# Check for repeated letters; merge the corresponding pieces if there are any, otherwise keep them unchanged
new_con_list = [con_list[0]]
local_w = 0
while local_w < len(label_str):
while label_str[local_w] == '0': # a '0' in the middle of label_str is skipped, i.e. dropped
local_w += 1
# NOTE(review): count() counts every occurrence in label_str, not only the adjacent run.
double_num = label_str.count(label_str[local_w])
if double_num >= 2:
# NOTE(review): the slice reads from con_list while the other operands use con_list2 - confirm.
new_con_list.append(con_list2[local_w + 1] + "".join(con_list[2 + local_w:local_w+double_num + 1]))
else:
new_con_list.append(con_list2[local_w + 1])
local_w += double_num
new_opt_letter = label_str.replace('AA',"A").replace('BB',"B").replace('CC',"C").replace('DD',"D")
if len(new_con_list) >= 4:
# NOTE(review): `in "ABCDEFGHIJ"` is a substring test, so any contiguous run such as "BCD" passes.
if "".join(sorted(new_opt_letter)) in "ABCDEFGHIJ" or "".join(sorted(new_opt_letter)) in ["ABCE", "ABDE", "ACDE", "BCDE"]:
return new_con_list
# Otherwise an error message string is returned instead of a list.
return "选项格式不正确,1、请改为: A.xxxx B.xxx,手动输入选项字母及后面的标点符号;" \
"2.第一个选项A与题干之间要换行,各选项按ABCD排序;3.选项含图片时用嵌入式;"