12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
- import re
- from pprint import pprint
def resplit(item_res):
    """Re-split question records whose content still holds several questions.

    Two situations are handled:

    1. A choice-type record (topic name, with "题" removed, is one of
       选择 / 单选 / 多选 / 双向选择) whose content contains a second numbered
       question stem.  The record is cut at that stem only when both halves
       show at least three distinct option letters (A-E).
    2. A record of any other topic except 填空 whose content contains
       lettered sub-questions sharing one number (e.g. 16A, 16B).  The text
       before the first sub-question stays in the original record and every
       sub-question becomes a record of its own.

    Records matching neither situation are passed through unchanged.

    :param item_res: iterable of dicts; each must provide ``'content'`` (str)
        and, whenever one of the patterns matches, ``'item_topic_name'``
        (str).  Records that get split have their ``'content'`` truncated
        in place.
    :return: a new list holding the original dict objects plus shallow
        copies for every split-off part.  Split-off choice questions get an
        ``int`` ``'item_id'``; split-off sub-questions get a ``str``
        ``'item_id'`` such as ``"16A"`` (any whitespace between number and
        letter is kept as written in the content).
    """
    # NOTE(review): the doubled characters in [..、、] and [((] / [))] look like
    # full-width variants that were lost somewhere upstream - confirm against
    # the original file.  The patterns are kept exactly as found.

    # A numbered stem (1-19) at a line start or right after </sup>/</sub>,
    # ending in "是( )"-style answer brackets.
    merged_choice = re.compile(
        r"((?<=\n)|(?<=</su[pb]>))\s*([1-9]|1[0-9])\s*[..、、].+?([是为有]|等于)[((]\s*[))]",
        re.S,
    )
    # One big question carrying the same number twice, e.g. 16A and 16B.
    sub_question = re.compile(
        r"\n\s*([1-9]|1[0-9])\s*[ABC]\s*[..、、].+?(求|试问|[是为等于]+多少)",
        re.S,
    )
    option_label = re.compile(r"[A-E]\s*[..、、]|[A-E]\s*<imgsrc")
    sub_question_id = re.compile(r"\n\s*([1-9]\s*[ABC]|1[0-9]\s*[ABC])")

    choice_topics = ['选择', '单选', '多选', '双向选择']
    unsplittable_topics = ['选择', '单选', '多选', '双向选择', '填空']

    new_res = []
    for one_res in item_res:
        content = one_res['content']
        stem = merged_choice.search(content)

        # Mainly targets two choice questions that were merged into one.
        if stem and one_res['item_topic_name'].replace("题", "") in choice_topics:
            head = content[:stem.start()]
            tail = content[stem.start():]
            # Every option_label hit starts with its option letter.
            head_letters = {hit[0] for hit in option_label.findall(head)}
            tail_letters = {hit[0] for hit in option_label.findall(tail)}
            if len(head_letters) >= 3 and len(tail_letters) >= 3:
                new_one = one_res.copy()
                # Group 2 of the stem match is the number of the second question.
                new_one['item_id'] = int(stem.group(2))
                one_res['content'] = head
                new_one['content'] = tail
                new_res.append(one_res)
                new_res.append(new_one)
            else:
                new_res.append(one_res)
        elif (sub_question.search(content)
              and one_res['item_topic_name'].replace("题", "") not in unsplittable_topics):
            starts = [hit.start() for hit in sub_question.finditer(content)]
            # Whatever precedes the first sub-question stays in the original.
            one_res['content'] = content[:starts[0]]
            new_res.append(one_res)
            for begin, end in zip(starts, starts[1:] + [None]):
                new_one = one_res.copy()
                new_one['content'] = content[begin:end]
                # The segment begins at a sub_question hit, so this always
                # matches.  The whole segment is used (not a fixed-length
                # prefix) so heavy indentation before the number cannot hide it.
                new_one['item_id'] = sub_question_id.match(new_one['content']).group(1)
                new_res.append(new_one)
        else:
            new_res.append(one_res)
    return new_res
|