#!/usr/bin/env/python # -*- coding:utf-8 -*- import re, os import configs from utils.washutil import table_label_cleal import numpy as np from PIL import Image def option2block(option_con, item_no_type): """ 选择题选项切分 对于选项切分部分,最好也像题号一样先自我切分纠错,但这样老师如果手误打错了字母,可能就解析出错!!!!! :return: """ def del_table(ss): ss = re.sub(r"||||", "", ss.replace("

", " ")) return ss # print('***********',option_con) if '' in option_con and \ len(re.findall('', option_con.strip())) == 1: st_opt = re.search('

(A\s*[..、、::].+?|\(A\)\s*[..、、]?.+?)

', option_con.strip()).start() option_con = option_con.strip()[0:st_opt] + '\n' + del_table(option_con.strip()[st_opt:]) # print("option_con:", option_con) option_con = re.sub(r"

(A\s*[..、、::].+?|\(A\)\s*[..、、]?.+?)

\n*?\s*

\s*(A\s*[、、..::]|\(A\)\s*[、、..]?)(.+?)", r"\n【【\1】】\2", option_con, re.S) if re.search("\n\s*C", option_con) is None and re.search("\n\s*c", option_con): option_con = re.sub("\n\s*c", "\nC", option_con) option_con = re.sub(r"(\n\s*(]).)*?/>\s*)+?\s*)(A[、、..::].+?)", r"\1\n\4", option_con.strip()) con = re.sub(r"\n\s*([A-H])\s*[、、..::](.+?)", r"\n【【\1、】】\2", option_con.strip()) # 行首的A、不能考虑,故得用strip if item_no_type == 1 and len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 2 and \ len(re.findall(r'\([A-H]\)', con)) > 2: # 针对题干是第一种类型,选项是第二种类型的情况 item_no_type = 2 if item_no_type == 2: con = re.sub(r"\n\s*\(([A-Hc])\)\s*[、、..]?(.+?)", r"\n【【\1、】】\2", option_con) if item_no_type == 1: if len(re.findall(r'【【[A-H]\s*[..、、]】】', con)) <= 3: while re.search(r"\n\s*[A-H]\s*\s+)(?\s+)(? 1: stem_opt = table_label_cleal(con_list[0]) con_list = list(map(del_table, con_list[1:])) con_list.insert(0, stem_opt) # 题干中的表格不需要清洗 return con_list, con recur_n = 1 # 递归次数 def option_structure(one_item, con, ans, item_no_type, is_danti=0): """ 选择题选项拆分结构化 还需要判断一下 选项个数与题型的对应!!!! :return: """ global recur_n # print(con) # print('----------------------') if recur_n > 2: if 'options' not in one_item: one_item["errmsgs"].append("选项格式不正确") recur_n = 1 return one_item ans = re.sub("[;;.]+", "", ans) ans2 = [] for a in ans.split("#"): if 0 < len(a.replace(" ", "")) < 8: ans2.append("、".join(re.findall(r"[A-G]", a.replace("c", "C")))) one_item["key"] = "; ".join(ans2) options_rank = get_options_arrange(one_item["stem"]) # print("id:", one_item['item_id']) # print("options_rank:",options_rank) con_list, repl_con = option2block(con, item_no_type) # print(len(con_list), con_list) # print('----------------------------') # 初筛 if len(con_list) < 5: opt_letter = re.findall(r"【【([A-H])\s*[..、、]】】", repl_con) if opt_letter and opt_letter[0] == 'B' and re.search("\s*A\s*[..、、].+?$", con_list[0]): re_split = re.sub("()\s*A\s*[..、、](.+?)$", r"\1【【A、】】\2", con_list[0]) con_list[0] = re_split.split("【【A、】】")[0] con_list.insert(1, re_split.split("【【A、】】")[1]) if len(con_list) >= 5: pattern_1 = re.compile(r"\s([1-9]|1[0-9])[..、、].+?([是为有]|等于)[((]\s*[))]\n", re.S) pattern_2 = re.compile(r"\s\(([1-9]|1[0-9])\).+?([是为有]|等于)[((]\s*[))]\n", re.S) pattern_3 = re.compile(r"([是为有]|等于)[((]\s*[))]\n", re.S) # 第一个错误针对题目中没有答案解析的情况,不然就是选项切分错误 if not is_danti: if (item_no_type == 1 and any([True for op in con_list[1:] if re.search(pattern_1, op)])) or \ (item_no_type == 2 and any([True for op in con_list[1:] if re.search(pattern_2, op)])): one_item["errmsgs"].append("本题选项与下一题题干间没有换行符,请注意重新换行!!!") # 一般只有一题和上一题连在一起 one_item['spliterr_point'] = one_item['item_id'] return one_item elif any([True for op in con_list[1:] if re.search(pattern_3, op)]): one_item["errmsgs"].append("本题的下一题的题号有问题,请注意重新输入!!!") one_item['spliterr_point'] = one_item['item_id'] # ------------------------------------------------------------------------ aft_opt = [] # 针对选项后是题目图片的情况 if "\n" in con_list[-1]: ccon = re.split("\n+", con_list[-1]) while re.match("|\n)\s*$|\s+$", "", i) for i in con_list[1:]], "options_rank": options_rank, }) # , "options_num": len(con_list[1:]) else: # 初次选项拆分的错误判断 con_list = option_label_correct(opt_letter, con_list, repl_con) # double_l = [key for key, value in dict(Counter(opt_letter)).items() if value > 1] if type(con_list) == str: one_item["errmsgs"].append(con_list) return one_item else: # con_list = pic_transfer(con_list) if con_list: return dict(one_item, **{"stem": con_list[0], "options": [re.sub("(
|\n)\s*$|\s+$", "", i) for i in con_list[1:]], "options_rank": options_rank, }) # return dict(one_item, **dict(zip(["stem","A","B","C","D"], con_list))) else: # 选项可能放在表格中 is_fail = 0 con_list2 = re.split(r"\n+", con) errmsgs = "" if len(con_list2) == 2: # 选项是4个图片组成的情况 option_array = len(re.findall("(^|\n) 2: # 排列情况 options_rank = 1 elif option_array > 1: options_rank = 3 else: options_rank = 2 ims = con_list2[1].split("|\n)\s*$|\s+$", "", i) for i in con_list2[1:]], "options_rank": options_rank, }) else: errmsgs = """选项格式不正确,请改为: A.xxxx (换行或多空几格) B.xxx。 【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输; 3>>选项太长时,每项之间要换行,上一项的内容不要与下一项在同一行!!""" is_fail = 1 else: con_list3 = re.split(r"\n(?=|\n)\s*$|\s+$", "", i) for i in con_list3[1:]], "options_rank": options_rank, }) else: errmsgs = """选项格式不正确,请改为: A.xxxx (换行或多空几格) B.xxx。 【注意】1>>选项和题干间要换行,选项不要放在表格中;2>>选项【如A.】重新手输 3>>选项太长时,每项之间要换行,上一项的内容不要与下一项在同一行!!""" is_fail = 1 op_con = re.split("[((]\s*[))]", con)[-1] stem_con = "".join(re.split("[((]\s*[))]", con)[:-1])+"( )\n" if is_fail: if "table" in op_con: to_clean_con = re.findall('(((?!()).)*)
', op_con, re.S) if len(to_clean_con) == 1: op_con = re.sub("||", "", op_con) one_item = option_structure(one_item, stem_con+op_con, ans, item_no_type) else: aa = re.findall("[A-E]", op_con) if len(aa) == len(set(aa)) == 4: recur_n += 1 op_con = re.sub("([A-E])\s*(?![..、、])", r"\1、", op_con) one_item = option_structure(one_item, stem_con + op_con, ans, item_no_type) if 'options' not in one_item and "选项格式不正确" not in "".join(one_item["errmsgs"]): one_item["errmsgs"].append(errmsgs) return one_item def get_options_arrange(cont): """ 判断word中选项每行排版个数 :return: """ options_rank = 1 # 纵向排列 option_num = 0 if '' in cont: table_op = re.findall('.+?>([A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?)', cont.strip()) if table_op: option_num = len(re.findall('[A-H]\s*[..、、].+?|\([A-Z]\)\s*[..、、]?.+?', table_op[0])) if option_num == 2: options_rank = 3 if option_num > 2: options_rank = 2 else: option_list = cont.split("\n") for op in option_list: if re.search("^\s*[A-H]\s*[..、、].+?|^\s*[A-H]\s*", opt): w_info1 = re.search(' style=".*?width: (\d+[.\d]*?)\s*([pxtin]*?);.*?"', img) w_info2 = re.search(' width="(\d+[.\d]*?)\s*([pxt]*?)"', img) if w_info1: if w_info1.group(2) == 'pt': pic_len.append((25.4/72)*float(w_info1.group(1))) elif w_info1.group(2) == 'px': pic_len.append((25.4 / 72) * (3/4) * float(w_info1.group(1))) elif w_info1.group(2) == 'in': pic_len.append(25.4 * float(w_info1.group(1))) elif w_info2: pic_len.append((25.4 / 72) * (3 / 4) * float(w_info2.group(1))) else: print("选项中存在图片宽高未知") # 主要没有宽高的图片是用户在编辑器新粘贴的图片,保存在本地,通过读取获取宽高 w_info3 = re.search('=*+$~%()\[\]{}\" ]", opt)) opt = re.sub(r"[a-z\d,.!?;'\-/:<>=*+$~%()\[\]{}\" ]", "", opt) char_zh_l = len(opt) char_len = (10.5/72)*25.4*(char_en_l*0.75+char_zh_l) option_len.append(sum(pic_len) + char_len) # 以最长的选项长度作为参考:<=6个中文字符则排成1行,<=15个中文字符则排成2排,否则都是纵向排列===>此逻辑不对 if sum(option_len) + (len(options)*2 + (len(options)-1)*4)*(10.5/72)*25.4 < 176-40: options_rank = 2 else: option_len = sorted(option_len, reverse=True) if option_len[0]+option_len[1] + (2*2+1*4)*(10.5/72)*25.4 <= 176-40: options_rank = 3 return options_rank def option_label_correct(opt_letter, con_list, con): """ 选项少切了会报错,所以优先解决多切的错误问题 纠正中标签错误的情况:选项字母不连续或重复; opt_letter:选项的字母 con_list:选择题拆分了选项的列表 """ lable_sign = re.findall(r"【【([A-H][..、、])】】", con.replace(" ", "")) con_list2 = con_list for i, j in enumerate(lable_sign): # 将con_list的选项字母加上 con_list[i + 1] = j + con_list[i + 1] # con_list2 = re.split(r"【【[A-H]\s*[..、、]】】", con) p1 = 0 # 选项在con_list中的起始位置 for k, v in enumerate(con_list[1:]): if re.search(r"[((]\s*[))]", v): # 选择题末尾一般都有() opt_letter[k] = '0' p1 = k + 2 if p1 and p1 < len(con_list[1:]): # '0'不在最后一个位置 option_list = con_list2[p1:] if len(option_list) >= 4: new_con_list = ["".join(con_list[:p1])].extend(option_list) return new_con_list else: # 只考虑ABCD和ACBD两种情况 label_str = "".join(opt_letter) if re.match("A","".join(opt_letter)) is None: label_str = re.sub("[^A]A", "0A", "".join(opt_letter), count=1) # print(label_str) # ------------------------------------------------------------- # 若选择题中没有(),题干中还是出现了AA的话,需要判断下是否存在错误 if re.search("AA", label_str): label_bcd_idx = [k for k, i in enumerate(label_str) if i != 'A'] label_a_idx = [k for k, i in enumerate(label_str) if i == 'A'] length_all = [] for i1 in label_bcd_idx: # 先将公式替换,作选项长度判断 l1 = len(re.sub(r").)+?/>", "", con_list2[i1+1]).replace(" ","")) length_all.append(l1) aver_length = np.mean(length_all) st_a = label_str.index("AA") for i2 in label_a_idx: l2 = len(re.sub(r").)+?/>", "", con_list2[i2+1]).replace(" ","")) if abs(l2 - aver_length) >= 12: if i2 >= st_a: st_a = i2+1 if st_a < len(label_str)-3: label_str = "".join(["0" if k < st_a else i for k, i in enumerate(label_str)]) # ----------------------------------------------------------------- label_str = re.sub("A[^BC]", "AA", label_str) label_str = re.sub("B[^CD]", "BB", label_str) label_str = re.sub("C[^BD]", "CC", label_str) label_str = re.sub("D[^E]", "DD", label_str) # 统计是否有重复的字符,若有,则进行合并,否则保持原来 new_con_list = [con_list[0]] local_w = 0 while local_w < len(label_str): if local_w == len(label_str) - 1 and label_str[local_w] == '0': break while label_str[local_w] == '0': # 如果‘0’在中间,则‘0’会被去除 local_w += 1 double_num = label_str.count(label_str[local_w]) if double_num >= 2: new_con_list.append(con_list2[local_w + 1] + "".join(con_list[2 + local_w:local_w+double_num + 1])) else: new_con_list.append(con_list2[local_w + 1]) local_w += double_num new_opt_letter = label_str.replace('AA',"A").replace('BB',"B").replace('CC',"C").replace('DD',"D") if len(new_con_list) >= 4: if "".join(sorted(new_opt_letter)) in "ABCDEFGHIJ" or "".join(sorted(new_opt_letter)) in ["ABCE", "ABDE", "ACDE", "BCDE"]: return new_con_list return "选项格式不正确,1、请改为: A.xxxx B.xxx,手动输入选项字母及后面的标点符号;" \ "2.第一个选项A与题干之间要换行,各选项按ABCD排序;3.选项含图片时用嵌入式;" def table_option_struc(stem): """ 表格类的选项结构化,在化学科目的选择题中较常见 :return: 表格仍然作为表格,选项则根据表格中的选项补充,如A、A B、B """ options = [] stem = re.sub('

\n+', "\n", stem) may_options = re.findall("

(((?!()).)*)
", stem) if may_options: options_data = may_options[-1][0] data_col = re.findall("(.*?)", options_data) # 第一列 if re.search("#?A#B#C#D#", re.sub("[..、、,,\s]|", "", "#".join(data_col).strip())+"#"): options_str = re.sub("[..、、,,\s]|", "", "#".join(data_col).strip()+"#") if "A#B#C#D#E#F#" not in options_str: if "A#B#C#D#E#" in options_str: options = ["A", "B", "C", "D", "E"] elif "A#B#C#D#" in options_str: options = ["A", "B", "C", "D"] else: data_rows = re.findall("(.*?)", options_data) if data_rows: data_row = re.findall("(.*?)", data_rows[0]) # 第一行 if re.search("#?A#B#C#D#", re.sub("[..、、,,\s]|", "", "#".join(data_row).strip()) + "#"): options_str = re.sub("[..、、,,\s]|", "", "#".join(data_row).strip() + "#") if "A#B#C#D#E#F#" not in options_str: if "A#B#C#D#E#" in options_str: options = ["A", "B", "C", "D", "E"] elif "A#B#C#D#" in options_str: options = ["A", "B", "C", "D"] return options if __name__ == '__main__': stem =""" 下列物质与危险化学品标志的对应关系不正确的是
ABCD
汽油天然气浓硫酸氢氧化钠
""" print(table_option_struc(stem))