sheet_points_by_nlp.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. # File:get_sheet_points_by_nlp.py
  2. # Author:lynn
  3. # Date:2020/5/19 18:23
  4. import ast, cv2, re
  5. from segment.sheet_resolve.analysis.sheet.tag_parse import TagParse
  6. import CRFPP
  7. from segment.sheet_resolve.tools.utils import crop_region
  8. try:
  9. import xml.etree.cElementTree as ET
  10. except ImportError:
  11. import xml.etree.ElementTree as ET
  12. def decide_coordinate_full_contains(coordinate1, coordinate2):
  13. xmin1 = coordinate1[0]
  14. ymin1 = coordinate1[1]
  15. xmax1 = coordinate1[2]
  16. ymax1 = coordinate1[3]
  17. mid_x = int(xmin1 + (xmax1 - xmin1)//2)
  18. mid_y = int(ymin1 + (ymax1 - ymin1)//2)
  19. xmin2 = coordinate2[0]
  20. ymin2 = coordinate2[1]
  21. xmax2 = coordinate2[2]
  22. ymax2 = coordinate2[3]
  23. if xmin1 <= xmin2 and ymin1 <= ymin2 and xmax1 >= xmax2 and ymax1 >= ymax2:
  24. return True
  25. else:
  26. return False
  27. def analyse_solve_solve0_result(result, ele):
  28. key_words = ['T', 'C', 'S', 'N', 'O', 'E']
  29. key_words_list = [ele[1] for ele in result]
  30. move_m_list = [ele for ele in key_words_list if ele != 'M']
  31. new_list = []
  32. if 'O' not in key_words_list:
  33. if 'N' in move_m_list and 'T' in move_m_list:
  34. index_n = key_words_list.index('N')
  35. index_t = key_words_list.index('T')
  36. numeber_str = result[index_n][0]
  37. number_pattern = re.findall('\d+', numeber_str)
  38. title_number = int(number_pattern[0])
  39. ele['number'] = title_number
  40. total_score_str = result[index_t][0]
  41. total_score_pattern = re.findall('\d+', total_score_str)
  42. total_score = int(total_score_pattern[0])
  43. ele['default_points'] = total_score
  44. new_list.append(ele)
  45. elif 'N' not in move_m_list and 'T' in move_m_list:
  46. index_t = key_words_list.index('T')
  47. total_score_str = result[index_t][0]
  48. total_score_pattern = re.findall('\d+', total_score_str)
  49. total_score = int(total_score_pattern[0])
  50. ele['default_points'] = total_score
  51. new_list.append(ele)
  52. elif 'N' in move_m_list and 'T' not in move_m_list:
  53. index_n = key_words_list.index('N')
  54. numeber_str = result[index_n][0]
  55. number_pattern = re.findall('\d+', numeber_str)
  56. title_number = int(number_pattern[0])
  57. ele['number'] = title_number
  58. new_list.append(ele)
  59. return new_list
  60. def analyse_choice_result(result, choice_m_list):
  61. key_words_list = [ele[1] for ele in result]
  62. move_m_list = [ele for ele in key_words_list if ele != 'M']
  63. o_len = [ele for ele in key_words_list if ele == 'O']
  64. if 'O' in key_words_list:
  65. index_o = [index for index, ele in enumerate(key_words_list) if ele == 'O']
  66. split_0_index = index_o
  67. split_0_index.insert(-1, len(key_words_list))
  68. split_0_index = sorted(list(set(split_0_index)))
  69. split_by_o_list = []
  70. for index, ele in enumerate(split_0_index):
  71. if index == 0:
  72. one_part = result[0: (split_0_index[index + 1]) - 1]
  73. split_by_o_list.append(one_part)
  74. elif ele == len(key_words_list):
  75. break
  76. else:
  77. one_part = result[split_0_index[index]: (split_0_index[index + 1]) - 1]
  78. split_by_o_list.append(one_part)
  79. print(split_by_o_list)
  80. number_with_value = []
  81. number_with_value1 = []
  82. for index0, ele0 in enumerate(split_by_o_list):
  83. part_key_words = [ele[1] for ele in ele0]
  84. index_oo = part_key_words.index('O')
  85. index_ss = part_key_words.index('S')
  86. contiue_number0 = ele0[index_oo]
  87. number_list = []
  88. if '-' in contiue_number0[0]:
  89. number_list = contiue_number0[0].split('-')
  90. elif '~' in contiue_number0[0]:
  91. number_list = contiue_number0[0].split('~')
  92. number_list = [int(ele) for ele in number_list]
  93. number_list_all = [i for i in range(number_list[0], number_list[1] + 1)]
  94. value = ele0[index_ss][0]
  95. nlp_number_value_dict = {}
  96. nlp_number_value_dict['number_list'] = number_list_all
  97. nlp_number_value_dict['value'] = value
  98. number_with_value1.append(nlp_number_value_dict)
  99. for ele in number_list_all:
  100. number_with_value.append({ele: value})
  101. print(number_with_value)
  102. for nlp_number in number_with_value1:
  103. number_list_nlp0 = nlp_number['number_list']
  104. value_nlp = nlp_number['value']
  105. for choice_m_box in choice_m_list:
  106. number_list_raw0 = choice_m_box['number']
  107. decide_whether_inclue = [False for c in number_list_raw0 if c not in number_list_nlp0]
  108. count_of_False = decide_whether_inclue.count(False)
  109. rows = choice_m_box['rows']
  110. if count_of_False / rows > 0.8:
  111. continue
  112. elif count_of_False / rows <= 0.4:
  113. points_list = [float(value_nlp) for i in range(0, rows)]
  114. choice_m_box['default_points'] = points_list
  115. return choice_m_list
  116. def analyse_cloze_result(result, cloze_and_cloze_s_list):
  117. new_list = []
  118. for cloze_and_cloze_s_ele in cloze_and_cloze_s_list:
  119. cloze_s_info = cloze_and_cloze_s_ele['cloze_s_info']
  120. key_words_list = [ele[1] for ele in result]
  121. move_m_list = [ele for ele in key_words_list if ele != 'M']
  122. for cloze_s_ele in cloze_s_info:
  123. if 'S' in move_m_list and 'T' in move_m_list:
  124. index_n = key_words_list.index('S')
  125. value_str = result[index_n][0]
  126. value_pattern = re.findall('\d+', value_str)
  127. value = int(value_pattern[0])
  128. cloze_s_ele['number'] = value
  129. elif 'C' not in move_m_list and 'T' in move_m_list:
  130. index_t = key_words_list.index('T')
  131. index_c = key_words_list.index('C')
  132. total_score_str = result[index_t][0]
  133. total_score_pattern = re.findall('\d+', total_score_str)
  134. total_score = int(total_score_pattern[0])
  135. value_per = float(total_score / int(index_c))
  136. cloze_s_ele['default_points'] = value_per
  137. new_list.append(cloze_s_ele)
  138. return new_list
  139. def analyse_cloze_result1(result, cloze_and_cloze_s_list):
  140. new_list = []
  141. for cloze_and_cloze_s_ele in cloze_and_cloze_s_list:
  142. cloze_s_info = cloze_and_cloze_s_ele['cloze_s_info']
  143. key_words_list = [ele[1] for ele in result]
  144. move_m_list = [ele for ele in key_words_list if ele != 'M']
  145. for cloze_s_ele in cloze_s_info:
  146. if 'S' in move_m_list and 'T' in move_m_list:
  147. index_n = key_words_list.index('S')
  148. value_str = result[index_n][0]
  149. value_pattern = re.findall('\d+', value_str)
  150. value = int(value_pattern[0])
  151. cloze_s_ele['number'] = value
  152. elif 'C' not in move_m_list and 'T' in move_m_list:
  153. index_t = key_words_list.index('T')
  154. index_c = key_words_list.index('C')
  155. total_score_str = result[index_t][0]
  156. total_score_pattern = re.findall('\d+', total_score_str)
  157. total_score = int(total_score_pattern[0])
  158. value_per = float(total_score / int(index_c))
  159. cloze_s_ele['default_points'] = value_per
  160. new_list.append(cloze_s_ele)
  161. return new_list
  162. def get_sheet_points_by_nlp(sheet_dict):
  163. # json_path = r'C:\Users\Administrator\Desktop\type_score_nlp\type_score_info\example\english\33.json'
  164. # file = open(json_path, 'r', encoding='gbk').read()
  165. # json_file = ast.literal_eval(file)
  166. regions = sheet_dict['regions']
  167. ocr_list = []
  168. new_list = []
  169. choice_m_list = [ele for ele in regions if ele['class_name'] == 'choice_m']
  170. cloze_list = [ele for ele in regions if ele['class_name'] == 'cloze']
  171. cloze_and_cloze_s_list = []
  172. for element_cloze in cloze_list:
  173. cloze_box = element_cloze['bounding_box']
  174. cloze_bbox = [cloze_box['xmin'], cloze_box['ymin'], cloze_box['xmax'], cloze_box['ymax']]
  175. cloze_s_dict = {}
  176. cloze_s_list = []
  177. for element in regions:
  178. if element['class_name'] == 'cloze_s':
  179. cloze_s_box = element['bounding_box']
  180. cloze_s_bbox = [cloze_s_box['xmin'], cloze_s_box['ymin'], cloze_s_box['xmax'], cloze_s_box['ymax']]
  181. if decide_coordinate_full_contains(cloze_bbox, cloze_s_bbox) == True:
  182. cloze_s_list.append(element)
  183. cloze_s_dict['cloze_info'] = element_cloze
  184. cloze_s_dict['cloze_s_info'] = cloze_s_list
  185. cloze_and_cloze_s_list.append(cloze_s_dict)
  186. print(cloze_and_cloze_s_list)
  187. for ele in regions:
  188. if 'type_score_ocr' in ele:
  189. ocr_list.append(ele)
  190. for index, ele in enumerate(ocr_list):
  191. ocr_content = ele['type_score_ocr']
  192. taggers = CRFPP.Tagger("-m " + './segment/sheet_resolve/model/nlp_model/crf2.model')
  193. tb = TagParse(taggers)
  194. result = tb.get_tag_val(ocr_content)
  195. if ele['class_name'] == 'cloze':
  196. print(ele)
  197. cloze_and_cloze_s_list0 = []
  198. for ele1 in cloze_and_cloze_s_list:
  199. print(ele1)
  200. ele1_cloze = [ele1['cloze_info']['bounding_box']['xmin'], ele1['cloze_info']['bounding_box']['ymin'],
  201. ele1['cloze_info']['bounding_box']['xmax'], ele1['cloze_info']['bounding_box']['ymax']]
  202. ele_cloze = [ele['bounding_box']['xmin'], ele['bounding_box']['ymin'],
  203. ele['bounding_box']['xmax'], ele['bounding_box']['ymax']]
  204. if ele1_cloze == ele_cloze:
  205. cloze_and_cloze_s_list0.append(ele1)
  206. new_list = analyse_cloze_result(result, cloze_and_cloze_s_list0)
  207. elif ele['class_name'] == 'choice':
  208. new_list = analyse_choice_result(result, choice_m_list)
  209. else:
  210. new_list = analyse_solve_solve0_result(result, ele)
  211. for index0, ele0 in enumerate(regions):
  212. for index1, ele1 in enumerate(new_list):
  213. class_name0 = ele0['class_name']
  214. bounding_box0 = ele0['bounding_box']
  215. class_name1 = ele1['class_name']
  216. bounding_box1 = ele1['bounding_box']
  217. if class_name0 == class_name1 and bounding_box0 == bounding_box1:
  218. ele0['default_points'] = ele1['default_points']
  219. # pop type_score_ocr
  220. for ele in regions:
  221. if 'type_score_ocr' in ele:
  222. ele.pop('type_score_ocr')
  223. sheet_dict.update({'regions': regions})
  224. return sheet_dict