#!/usr/bin/env/python # -*- coding:utf-8 -*- import re, os import configs from utils.washutil import table_label_cleal import numpy as np from PIL import Image def option2block(option_con, item_no_type): """ 选择题选项切分 对于选项切分部分,最好也像题号一样先自我切分纠错,但这样老师如果手误打错了字母,可能就解析出错!!!!! :return: """ def del_table(ss): ss = re.sub(r"?t[dr]>|?tbody>|?table>|?div>|?p>", "", ss.replace("
", " ")) return ss # print('***********',option_con) if '
(A\s*[..、、::].+?|\(A\)\s*[..、、]?.+?) |
(A\s*[..、、::].+?|\(A\)\s*[..、、]?.+?) |
\s*(A\s*[、、..::]|\(A\)\s*[、、..]?)(.+?)", r"\n【【\1】】\2", option_con, re.S)
if re.search("\n\s*C", option_con) is None and re.search("\n\s*c", option_con):
option_con = re.sub("\n\s*c", "\nC", option_con)
option_con = re.sub(r"(\n\s*(\s*)+?\s*)(A[、、..::].+?)", r"\1\n\3", option_con.strip())
con = re.sub(r"\n\s*([A-H])\s*[、、..::](.+?)", r"\n【【\1、】】\2", option_con.strip()) # 行首的A、不能考虑,故得用strip
if item_no_type == 1 and len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 2 and \
len(re.findall(r'\([A-H]\)', con)) > 2: # 针对题干是第一种类型,选项是第二种类型的情况
item_no_type = 2
if item_no_type == 2:
con = re.sub(r"\n\s*\(([A-Hc])\)\s*[、、..]?(.+?)", r"\n【【\1、】】\2", option_con)
if item_no_type == 1:
if len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 3:
while re.search(r"\n\s*[A-H]\s*\s+)(?\s+)(? 1:
stem_opt = table_label_cleal(con_list[0])
con_list = list(map(del_table, con_list[1:]))
con_list.insert(0, stem_opt) # 题干中的表格不需要清洗
return con_list, con
recur_n = 1 # recursion count
def option_structure(one_item, con, ans, item_no_type, is_danti=0):
"""
选择题选项拆分结构化
还需要判断一下 选项个数与题型的对应!!!!
:return:
"""
global recur_n
# print(con)
# print('----------------------')
if recur_n>2:
if 'options' not in one_item:
one_item["errmsgs"].append("选项格式不正确")
recur_n = 1
return one_item
ans = re.sub("[;;.]+", "", ans)
ans2 = []
for a in ans.split("#"):
if 0
|\n)\s*$|\s+$", "", i) for i in con_list[1:]],
"options_rank": options_rank,
})
# return dict(one_item, **dict(zip(["stem","A","B","C","D"], con_list)))
else:
# 选项可能放在表格中
is_fail = 0
con_list2 = re.split(r"\n+", con)
errmsgs = ""
if len(con_list2) == 2: # 选项是4个图片组成的情况
option_array = len(re.findall("(^|\n) 2: # 排列情况
options_rank = 1
elif option_array > 1:
options_rank = 3
else:
options_rank = 2
ims = con_list2[1].split("|\n)\s*$|\s+$", "", i) for i in con_list2[1:]],
"options_rank": options_rank,
})
else:
errmsgs = """选项格式不正确,请改为: A.xxxx B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输;3>>选项图片时用嵌入式;
4>>选项太长时,每项之间要换行,上一项的内容不要与下一项在同一行!!"""
is_fail = 1
else:
con_list3 = re.split(r"\n(?=|\n)\s*$|\s+$", "", i) for i in con_list3[1:]],
"options_rank": options_rank,
})
else:
errmsgs = """选项格式不正确,请改为: A.xxxx B.xxx 或 (A)xxxx (B)xxx,全文选项和题号格式要统一。
【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输;3>>选项图片时用嵌入式;
4>>选项太长时,每项之间要换行,上一项的内容不要与下一项在同一行!!"""
is_fail = 1
op_con = re.split("[((]\s*[))]", con)[-1]
stem_con = "".join(re.split("[((]\s*[))]", con)[:-1])+"( )\n"
if is_fail:
if "table" in op_con:
to_clean_con = re.findall('(((?!(?table>)).)*)
', op_con, re.S)
if len(to_clean_con) == 1:
op_con = re.sub("?table>|?tr>|?td>", "", op_con)
one_item = option_structure(one_item, stem_con+op_con, ans, item_no_type)
else:
aa = re.findall("[A-E]", op_con)
if len(aa)==len(set(aa)) == 4:
recur_n += 1
op_con = re.sub("([A-E])\s*(?![..、、])", r"\1、", op_con)
one_item = option_structure(one_item, stem_con + op_con, ans, item_no_type)
if 'options' not in one_item and "选项格式不正确" not in "".join(one_item["errmsgs"]):
one_item["errmsgs"].append(errmsgs)
return one_item
def get_options_arrange(cont):
"""
判断word中选项每行排版个数
:return:
"""
options_rank = 1 # 纵向排列
option_num = 0
if '' in cont:
table_op = re.findall(' .+?>([A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?) ', cont.strip())
if table_op:
option_num = len(re.findall('[A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?', table_op[0]))
if option_num == 2:
options_rank = 3
if option_num > 2:
options_rank = 2
else:
option_list = cont.split("\n")
for op in option_list:
if re.search("^\s*[A-H]\s*[..、、].+?|^\s*[A-H]\s*", opt):
w_info1 = re.search(' style=".*?width: (\d+[.\d]*?)\s*([pxtin]*?);.*?"', img)
w_info2 = re.search(' width="(\d+[.\d]*?)\s*([pxt]*?)"', img)
if w_info1:
if w_info1.group(2) == 'pt':
pic_len.append((25.4/72)*float(w_info1.group(1)))
elif w_info1.group(2) == 'px':
pic_len.append((25.4 / 72) * (3/4) * float(w_info1.group(1)))
elif w_info1.group(2) == 'in':
pic_len.append(25.4 * float(w_info1.group(1)))
elif w_info2:
pic_len.append((25.4 / 72) * (3 / 4) * float(w_info2.group(1)))
else:
print("选项中存在图片宽高未知")
# 主要没有宽高的图片是用户在编辑器新粘贴的图片,保存在本地,通过读取获取宽高
w_info3 = re.search('=*+$~%()\[\]{}\" ]", opt))
opt = re.sub(r"[a-z\d,.!?;'\-/:<>=*+$~%()\[\]{}\" ]", "", opt)
char_zh_l = len(opt)
char_len = (10.5/72)*25.4*(char_en_l*0.75+char_zh_l)
option_len.append(sum(pic_len) + char_len)
# 以最长的选项长度作为参考:<=6个中文字符则排成1行,<=15个中文字符则排成2排,否则都是纵向排列===>此逻辑不对
if sum(option_len) + (len(options)*2 + (len(options)-1)*4)*(10.5/72)*25.4 < 176-40:
options_rank = 2
else:
option_len = sorted(option_len, reverse=True)
if option_len[0]+option_len[1] + (2*2+1*4)*(10.5/72)*25.4 <= 176-40:
options_rank = 3
return options_rank
def option_label_correct(opt_letter, con_list, con):
    """Repair an over-split choice item whose option labels repeat or are out of order.

    An item that was split into too few options is reported as an error
    anyway, so this routine concentrates on the opposite mistake: text that
    was split into too many pieces (for example because the stem itself
    contains something that looks like an option label).

    Args:
        opt_letter: List of option letters, one per split piece. Mutated in
            place: the entry of every piece containing an empty bracket pair
            is set to ``'0'``.
        con_list: ``[stem, piece_1, piece_2, ...]``. Mutated in place: every
            piece gets its label (letter plus punctuation) put back in front.
        con: The marked-up item text containing ``【【X、】】`` style markers.

    Returns:
        A list ``[stem, option, option, ...]`` when the pieces could be
        merged into a plausible option sequence, otherwise the error-message
        string shown to the user.
    """
    error_msg = ("选项格式不正确,1、请改为: A.xxxx B.xxx,手动输入选项字母及后面的标点符号;"
                 "2.第一个选项A与题干之间要换行,各选项按ABCD排序;3.选项含图片时用嵌入式;")

    # NOTE(review): the pattern found in the file was `").)+?/>"`, which is
    # not a valid regex (unbalanced parenthesis). It looks like an
    # `<img .../>` matcher whose opening was lost; confirm against upstream.
    img_tag = re.compile(r"<img((?!/>).)+?/>")

    def visible_len(text):
        # Length of a piece once image tags and spaces are ignored.
        return len(img_tag.sub("", text).replace(" ", ""))

    # Put each label back in front of its piece. The label list is capped so
    # that a surplus of markers cannot index past the end of con_list.
    labels = re.findall(r"【【([A-H][.．、､])】】", con.replace(" ", ""))
    for idx, label in enumerate(labels[:max(len(con_list) - 1, 0)], start=1):
        con_list[idx] = label + con_list[idx]

    # A choice stem normally ends with an empty bracket pair; a piece that
    # contains one is therefore treated as stem text, not as an option.
    split_at = 0  # index in con_list where the real options start
    for k, piece in enumerate(con_list[1:]):
        if re.search(r"[(（]\s*[)）]", piece):
            opt_letter[k] = '0'
            split_at = k + 2

    if split_at and split_at < len(con_list) - 1:
        option_list = con_list[split_at:]
        if len(option_list) >= 4:
            # list.extend() returns None, so build the result by concatenation.
            return ["".join(con_list[:split_at])] + option_list
    else:
        # Only the ABCD and ACBD orderings are considered here.
        label_str = "".join(opt_letter)
        if re.match("A", label_str) is None:
            label_str = re.sub("[^A]A", "0A", label_str, count=1)

        # Without a bracket pair, a doubled "A" may still mean that the stem
        # was split: compare the length of each "A" piece with the average
        # length of the other pieces to decide where the options start.
        if "AA" in label_str:
            other_idx = [k for k, ch in enumerate(label_str) if ch != 'A']
            a_idx = [k for k, ch in enumerate(label_str) if ch == 'A']
            lengths = [visible_len(con_list[k + 1]) for k in other_idx]
            aver_length = np.mean(lengths) if lengths else float("nan")
            st_a = label_str.index("AA")
            for k in a_idx:
                if abs(visible_len(con_list[k + 1]) - aver_length) >= 12 and k >= st_a:
                    st_a = k + 1
            if st_a < len(label_str) - 3:
                label_str = "".join(
                    "0" if k < st_a else ch for k, ch in enumerate(label_str)
                )

        # A label followed by an unexpected one is treated as a repeat of itself.
        label_str = re.sub("A[^BC]", "AA", label_str)
        label_str = re.sub("B[^CD]", "BB", label_str)
        label_str = re.sub("C[^BD]", "CC", label_str)
        label_str = re.sub("D[^E]", "DD", label_str)

        # Merge the pieces that share a label; pieces labelled '0' are dropped.
        new_con_list = [con_list[0]]
        pos = 0
        total = len(label_str)
        while pos < total:
            while pos < total and label_str[pos] == '0':
                pos += 1
            if pos >= total:
                break  # only '0' labels were left
            repeat = label_str.count(label_str[pos])
            new_con_list.append(
                con_list[pos + 1] + "".join(con_list[pos + 2:pos + repeat + 1])
            )
            pos += repeat

        # NOTE(review): '0' entries stay in new_opt_letter, so any sequence
        # that had a piece zeroed out fails the check below — confirm intended.
        new_opt_letter = (label_str.replace('AA', "A").replace('BB', "B")
                          .replace('CC', "C").replace('DD', "D"))
        if len(new_con_list) >= 4:
            ordered = "".join(sorted(new_opt_letter))
            if ordered in "ABCDEFGHIJ" or ordered in ["ABCE", "ABDE", "ACDE", "BCDE"]:
                return new_con_list
    return error_msg
def _option_letters_in_cells(cells):
    """Check whether *cells* list the option letters A-D (or A-E) in order.

    Returns a pair ``(matched, letters)``: ``matched`` tells whether the
    cells contain the run A, B, C, D at all; ``letters`` is the option list
    to use (empty when the run continues up to F).
    """
    marker = re.sub(r"[.．、､,，\s]", "", "#".join(cells)) + "#"
    if re.search("#?A#B#C#D#", marker) is None:
        return False, []
    if "A#B#C#D#E#F#" in marker:
        return True, []
    if "A#B#C#D#E#" in marker:
        return True, ["A", "B", "C", "D", "E"]
    return True, ["A", "B", "C", "D"]


def table_option_struc(stem):
    """Derive the option letters of a choice item whose options sit in a table.

    The table itself is left in the stem; only the list of option letters is
    produced. The last table of *stem* is inspected: first its first column,
    and, when that column does not list A-D, its first row.

    NOTE(review): the tag patterns below were reconstructed, because the
    literals in the file had lost their markup (the table pattern was split
    over two lines and contained the invalid group ``(?table>)``; the cell
    and row patterns were reduced to ``"(.*?) "``). Confirm them against the
    upstream source.

    Args:
        stem: Item text that may contain ``<table>`` markup.

    Returns:
        ``["A", "B", "C", "D"]`` or ``["A", "B", "C", "D", "E"]`` when the
        table lists exactly those letters, otherwise an empty list.
    """
    tables = re.findall(r"<table[^>]*>((?:(?!</?table\b).)*)</table>", stem, re.S)
    if not tables:
        return []
    table_body = tables[-1]

    # Options laid out vertically: the first cell of every row.
    first_column = re.findall(r"<tr[^>]*>\s*<td[^>]*>(.*?)</td>", table_body, re.S)
    matched, letters = _option_letters_in_cells(first_column)
    if matched:
        return letters

    # Options laid out horizontally: the cells of the first row.
    rows = re.findall(r"<tr[^>]*>(.*?)</tr>", table_body, re.S)
    if not rows:
        return []  # a table without rows has no options to offer
    first_row = re.findall(r"<td[^>]*>(.*?)</td>", rows[0], re.S)
    return _option_letters_in_cells(first_row)[1]
if __name__ == '__main__':
    # Small manual check of table_option_struc.
    # NOTE(review): the table markup below was rebuilt from cell text that
    # was left over after the print call in the file; the original markup
    # (and any further rows) could not be recovered — confirm against upstream.
    stem = """
下列物质与危险化学品标志的对应关系不正确的是
<table><tr><td>A</td><td>B</td><td>C</td><td>D</td></tr><tr><td>汽油</td><td>天然气</td><td>浓硫酸</td><td>氢氧化钠</td></tr></table>
"""
    print(table_option_struc(stem))