item_resplit.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. from pprint import pprint
  5. def resplit(item_res):
  6. """
  7. 对题目的切分结果,判断是否还有题目切分到一起的情况,并进行继续拆分
  8. :param item_res:
  9. :return:
  10. """
  11. new_res = []
  12. for k, one_res in enumerate(item_res):
  13. p1 = re.search(r"((?<=\n)|(?<=</su[pb]>))\s*([1-9]|1[0-9])\s*[..、、].+?([是为有]|等于)[((]\s*[))]", one_res['stem'], re.S)
  14. # 大题出现2个相同序号如16A、16B
  15. p2 = re.compile(r"\n\s*([1-9]|1[0-9])\s*[ABC]\s*[..、、].+?(求|试问|[是为等于]+多少)", re.S)
  16. if p1 and one_res['type'].replace("题", "") in ['选择', '单选', '多选', '不定选择']: # 主要针对2个题合在一起的情况
  17. con1 = one_res['stem'][:p1.start()]
  18. con2 = one_res['stem'][p1.start():]
  19. upletter_num1 = re.findall("[A-E]\s*[..、、]|[A-E]\s*<imgsrc", con1)
  20. upletter_num1 = set([re.findall("[A-E]", i)[0] for i in upletter_num1])
  21. upletter_num2 = re.findall("[A-E]\s*[..、、]|[A-E]\s*<imgsrc", con2)
  22. upletter_num2 = set([re.findall("[A-E]", i)[0] for i in upletter_num2])
  23. if len(upletter_num1)>=3 and len(upletter_num2)>=3:
  24. new_one = one_res.copy()
  25. new_one_list = re.split("^\n*\s*([1-9]|1[0-9])\s*[..、、]", con2)
  26. if len(new_one_list) == 3:
  27. new_one['item_id'] = int(new_one_list[1])
  28. new_one['stem'] = new_one_list[2]
  29. one_res['stem'] = con1
  30. new_res.append(one_res)
  31. new_res.append(new_one)
  32. else:
  33. new_res.append(one_res)
  34. else:
  35. new_res.append(one_res)
  36. elif re.search(p2, one_res['stem']) and one_res['type'].replace("题", "") not in ['选择', '单选', '多选', '不定选择', '填空']:
  37. temp = one_res['stem']
  38. split_p = [i.start() for i in re.finditer(p2, one_res['stem'])]
  39. one_res['stem'] = temp[:split_p[0]]
  40. new_res.append(one_res)
  41. for i, j in zip(split_p, split_p[1:] + [None]):
  42. new_one = one_res.copy()
  43. new_one['stem'] = temp[i:j]
  44. new_one['item_id'] = re.search("\n\s*([1-9]\s*[ABC]|1[0-9]\s*[ABC])", new_one['stem'][:10]).group(1)
  45. new_res.append(new_one)
  46. else:
  47. new_res.append(one_res)
  48. return new_res