insert_keywords.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. #!/usr/bin/env python
  2. # -*- coding:utf-8 -*-
  3. import re
  4. def get_con(subcon, item_no_type, **kwargs):
  5. """
  6. # ------在下一题【解析】在本题【答案】之间找到下一题【content】的位置--------
  7. # 主要统计题号的位置,空行信息作为辅助
  8. :return:
  9. """
  10. errmsg_dict = {}
  11. item_no = []
  12. type_info = kwargs['all_type'][kwargs['num']] if 'all_type' in kwargs else ""
  13. index = kwargs['index']
  14. errmsg_bef = type_info+"第{0}道题(在整篇文档中为第{1}题)的题文和上一题的解析之间" if type_info \
  15. else "整篇文档中第{0}题的题文和上一题的解析之间{1}"
  16. count = 1 # 题量个数
  17. for id in range(4, len(subcon), 4):
  18. if id < len(subcon) - 1: # 最后一个不用管
  19. count += 1 # 当前在本大题中是第几个
  20. ssub = subcon[id].strip().split("\n") # 首尾空行先去掉
  21. blank_line = [i for i, v in enumerate(ssub) if v.strip() == ""] # 空格的索引
  22. # 索引to题号字典,获取可能的题号的位置
  23. pattern1 = r"([1-9]|[1-4][0-9])\s*[..、、]" if item_no_type==1 else r"\(([1-9]|[1-4][0-9])\)\s*[..、、]?"
  24. line_topicno_dict = {i: re.match(pattern1, v.strip()).group(1)
  25. for i, v in enumerate(ssub)
  26. if re.match(pattern1, v.strip())}
  27. # print("line_topicno_dict",line_topicno_dict)
  28. con_id_line = list(line_topicno_dict.keys()) # 题号的行索引,第几行
  29. topicno = list(line_topicno_dict.values()) # 题号序列
  30. topicno_line_dict = dict(zip(topicno, con_id_line)) # 题号to行索引字典
  31. if len(con_id_line) != len(topicno_line_dict):
  32. # 相同序号不是题目序号时
  33. from collections import Counter
  34. topicno_set = [i for i, j in dict(Counter(topicno)).items() if j == 1]
  35. if len(topicno_set) == 1 and 0 <= int(topicno_set[0]) - (item_no[-1] + 1) <= 1:
  36. ssub.insert(topicno_line_dict[topicno_set[0]], "【content】")
  37. item_no.append(int(topicno_set[0]))
  38. else:
  39. item_no.append(item_no[-1] + 1)
  40. errmsg = "【多个相同的题目序号或题目序号有误】"
  41. errmsg = errmsg_bef.format(str(count), str(index + count)) + errmsg if type_info \
  42. else errmsg_bef.format(str(index + count), "") + errmsg
  43. errmsg_dict[id / 4 - 1] = errmsg if (id / 4 - 1) not in errmsg_dict \
  44. else errmsg_dict[id / 4 - 1] + ";" + errmsg
  45. errmsg_dict[id / 4] = errmsg if id / 4 not in errmsg_dict \
  46. else errmsg_dict[id / 4] + ";" + errmsg
  47. # 应取中间偏下方的序号为准!!!
  48. if len(blank_line) == 1: # 大概率空格处
  49. ssub[blank_line[0]] = "【content】"
  50. else:
  51. intervals = [(len("".join(ssub[0:r])), len("".join(ssub[r:]))) for r in
  52. con_id_line]
  53. intervals = [k for k, r in enumerate(intervals) if
  54. r[0] > 100 and 50 <= r[1] <= 200]
  55. if len(intervals) == 1:
  56. print("【多个相同的题目序号】切分不严谨")
  57. ssub.insert(con_id_line[intervals[0]], "【content】")
  58. else:
  59. ssub = ["", "【content】"].extend(ssub) # 分不出来,先替换
  60. else:
  61. if len(con_id_line) == 1: # 一个题号
  62. if len(blank_line) == 1 and con_id_line[0] - blank_line[0] == 1: # 空格在前,题号在后
  63. ssub.insert(con_id_line[0], "【content】")
  64. item_no.append(int(topicno[0]))
  65. else:
  66. if 0 <= int(topicno[0]) - (item_no[-1] + 1) <= 1: # 允许题号相差1个
  67. ssub.insert(con_id_line[0], "【content】")
  68. item_no.append(int(topicno[0]))
  69. else:
  70. ssub[blank_line[0]] = "【content】" # 该题序号不对时再考虑空行
  71. item_no.append(item_no[-1] + 1)
  72. errmsg = "【题目序号不连续】"
  73. errmsg = errmsg_bef.format(str(count), str(index + count)) + errmsg if type_info \
  74. else errmsg_bef.format(str(index + count), "") + errmsg
  75. errmsg_dict[id / 4 - 1] = errmsg if (id / 4 - 1) not in errmsg_dict \
  76. else errmsg_dict[id / 4 - 1] + ";" + errmsg
  77. errmsg_dict[id / 4] = errmsg if id / 4 not in errmsg_dict \
  78. else errmsg_dict[id / 4] + ";" + errmsg
  79. elif len(con_id_line) > 1: # 多个题号时
  80. if str(item_no[-1] + 1) in topicno:
  81. item_no.append(item_no[-1] + 1)
  82. ssub.insert(topicno_line_dict[str(item_no[-1] + 1)], "【content】")
  83. elif str(item_no[-1] + 2) in topicno:
  84. item_no.append(item_no[-1] + 2)
  85. ssub.insert(topicno_line_dict[str(item_no[-1] + 2)], "【content】")
  86. else:
  87. item_no.append(item_no[-1] + 1)
  88. ssub = ["", "【content】"].extend(ssub) # 分不出来,先替换
  89. errmsg = "【题目序号不连续】"
  90. errmsg = errmsg_bef.format(str(count), str(index + count)) + errmsg if type_info \
  91. else errmsg_bef.format(str(index + count), "") + errmsg
  92. errmsg_dict[id / 4 - 1] = errmsg if (id / 4 - 1) not in errmsg_dict \
  93. else errmsg_dict[id / 4 - 1] + ";" + errmsg
  94. errmsg_dict[id / 4] = errmsg if id / 4 not in errmsg_dict \
  95. else errmsg_dict[id / 4] + ";" + errmsg
  96. else: # 无题号
  97. item_no.append(item_no[-1] + 1)
  98. errmsg = "【没有题目序号】"
  99. errmsg = errmsg_bef.format(str(count), str(index + count)) + errmsg if type_info \
  100. else errmsg_bef.format(str(index + count), "") + errmsg
  101. errmsg_dict[id / 4 - 1] = errmsg if (id / 4 - 1) not in errmsg_dict \
  102. else errmsg_dict[id / 4 - 1] + ";" + errmsg
  103. errmsg_dict[id / 4] = errmsg if id / 4 not in errmsg_dict \
  104. else errmsg_dict[id / 4] + ";" + errmsg
  105. if len(blank_line) == 1:
  106. ssub[blank_line[0]] = "【content】"
  107. else:
  108. ssub = ["", "【content】"].extend(ssub) # 分不出来,先替换
  109. subcon[id] = "\n".join(ssub)
  110. all_item = re.split(r"【content】", "\n".join(subcon).strip())
  111. return all_item, item_no, errmsg_dict, count