sheet_points_by_nlp.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. # File:get_sheet_points_by_nlp.py
  2. # Author:lynn
  3. # Date:2020/5/19 18:23
  4. import ast, cv2, re
  5. from segment.sheet_resolve.analysis.sheet.tag_parse import TagParse
  6. from segment.sheet_resolve.tools.utils import crop_region
  7. try:
  8. import xml.etree.cElementTree as ET
  9. except ImportError:
  10. import xml.etree.ElementTree as ET
  11. def decide_coordinate_full_contains(coordinate1, coordinate2):
  12. xmin1 = coordinate1[0]
  13. ymin1 = coordinate1[1]
  14. xmax1 = coordinate1[2]
  15. ymax1 = coordinate1[3]
  16. mid_x = int(xmin1 + (xmax1 - xmin1)//2)
  17. mid_y = int(ymin1 + (ymax1 - ymin1)//2)
  18. xmin2 = coordinate2[0]
  19. ymin2 = coordinate2[1]
  20. xmax2 = coordinate2[2]
  21. ymax2 = coordinate2[3]
  22. if xmin1 <= xmin2 and ymin1 <= ymin2 and xmax1 >= xmax2 and ymax1 >= ymax2:
  23. return True
  24. else:
  25. return False
  26. def analyse_solve_solve0_result(result, ele):
  27. key_words = ['T', 'C', 'S', 'N', 'O', 'E']
  28. key_words_list = [ele[1] for ele in result]
  29. move_m_list = [ele for ele in key_words_list if ele != 'M']
  30. new_list = []
  31. if 'O' not in key_words_list:
  32. if 'N' in move_m_list and 'T' in move_m_list:
  33. index_n = key_words_list.index('N')
  34. index_t = key_words_list.index('T')
  35. numeber_str = result[index_n][0]
  36. number_pattern = re.findall('\d+', numeber_str)
  37. title_number = int(number_pattern[0])
  38. ele['number'] = title_number
  39. total_score_str = result[index_t][0]
  40. total_score_pattern = re.findall('\d+', total_score_str)
  41. total_score = int(total_score_pattern[0])
  42. ele['default_points'] = total_score
  43. new_list.append(ele)
  44. elif 'N' not in move_m_list and 'T' in move_m_list:
  45. index_t = key_words_list.index('T')
  46. total_score_str = result[index_t][0]
  47. total_score_pattern = re.findall('\d+', total_score_str)
  48. total_score = int(total_score_pattern[0])
  49. ele['default_points'] = total_score
  50. new_list.append(ele)
  51. elif 'N' in move_m_list and 'T' not in move_m_list:
  52. index_n = key_words_list.index('N')
  53. numeber_str = result[index_n][0]
  54. number_pattern = re.findall('\d+', numeber_str)
  55. title_number = int(number_pattern[0])
  56. ele['number'] = title_number
  57. new_list.append(ele)
  58. return new_list
  59. def analyse_choice_result(result, choice_m_list):
  60. key_words_list = [ele[1] for ele in result]
  61. move_m_list = [ele for ele in key_words_list if ele != 'M']
  62. o_len = [ele for ele in key_words_list if ele == 'O']
  63. if 'O' in key_words_list:
  64. index_o = [index for index, ele in enumerate(key_words_list) if ele == 'O']
  65. split_0_index = index_o
  66. split_0_index.insert(-1, len(key_words_list))
  67. split_0_index = sorted(list(set(split_0_index)))
  68. split_by_o_list = []
  69. for index, ele in enumerate(split_0_index):
  70. if index == 0:
  71. one_part = result[0: (split_0_index[index + 1]) - 1]
  72. split_by_o_list.append(one_part)
  73. elif ele == len(key_words_list):
  74. break
  75. else:
  76. one_part = result[split_0_index[index]: (split_0_index[index + 1]) - 1]
  77. split_by_o_list.append(one_part)
  78. print(split_by_o_list)
  79. number_with_value = []
  80. number_with_value1 = []
  81. for index0, ele0 in enumerate(split_by_o_list):
  82. part_key_words = [ele[1] for ele in ele0]
  83. index_oo = part_key_words.index('O')
  84. index_ss = part_key_words.index('S')
  85. contiue_number0 = ele0[index_oo]
  86. number_list = []
  87. if '-' in contiue_number0[0]:
  88. number_list = contiue_number0[0].split('-')
  89. elif '~' in contiue_number0[0]:
  90. number_list = contiue_number0[0].split('~')
  91. number_list = [int(ele) for ele in number_list]
  92. number_list_all = [i for i in range(number_list[0], number_list[1] + 1)]
  93. value = ele0[index_ss][0]
  94. nlp_number_value_dict = {}
  95. nlp_number_value_dict['number_list'] = number_list_all
  96. nlp_number_value_dict['value'] = value
  97. number_with_value1.append(nlp_number_value_dict)
  98. for ele in number_list_all:
  99. number_with_value.append({ele: value})
  100. print(number_with_value)
  101. for nlp_number in number_with_value1:
  102. number_list_nlp0 = nlp_number['number_list']
  103. value_nlp = nlp_number['value']
  104. for choice_m_box in choice_m_list:
  105. number_list_raw0 = choice_m_box['number']
  106. decide_whether_inclue = [False for c in number_list_raw0 if c not in number_list_nlp0]
  107. count_of_False = decide_whether_inclue.count(False)
  108. rows = choice_m_box['rows']
  109. if count_of_False / rows > 0.8:
  110. continue
  111. elif count_of_False / rows <= 0.4:
  112. points_list = [float(value_nlp) for i in range(0, rows)]
  113. choice_m_box['default_points'] = points_list
  114. return choice_m_list
  115. def analyse_cloze_result(result, cloze_and_cloze_s_list):
  116. new_list = []
  117. for cloze_and_cloze_s_ele in cloze_and_cloze_s_list:
  118. cloze_s_info = cloze_and_cloze_s_ele['cloze_s_info']
  119. key_words_list = [ele[1] for ele in result]
  120. move_m_list = [ele for ele in key_words_list if ele != 'M']
  121. for cloze_s_ele in cloze_s_info:
  122. if 'S' in move_m_list and 'T' in move_m_list:
  123. index_n = key_words_list.index('S')
  124. value_str = result[index_n][0]
  125. value_pattern = re.findall('\d+', value_str)
  126. value = int(value_pattern[0])
  127. cloze_s_ele['number'] = value
  128. elif 'C' not in move_m_list and 'T' in move_m_list:
  129. index_t = key_words_list.index('T')
  130. index_c = key_words_list.index('C')
  131. total_score_str = result[index_t][0]
  132. total_score_pattern = re.findall('\d+', total_score_str)
  133. total_score = int(total_score_pattern[0])
  134. value_per = float(total_score / int(index_c))
  135. cloze_s_ele['default_points'] = value_per
  136. new_list.append(cloze_s_ele)
  137. return new_list
  138. def analyse_cloze_result1(result, cloze_and_cloze_s_list):
  139. new_list = []
  140. for cloze_and_cloze_s_ele in cloze_and_cloze_s_list:
  141. cloze_s_info = cloze_and_cloze_s_ele['cloze_s_info']
  142. key_words_list = [ele[1] for ele in result]
  143. move_m_list = [ele for ele in key_words_list if ele != 'M']
  144. for cloze_s_ele in cloze_s_info:
  145. if 'S' in move_m_list and 'T' in move_m_list:
  146. index_n = key_words_list.index('S')
  147. value_str = result[index_n][0]
  148. value_pattern = re.findall('\d+', value_str)
  149. value = int(value_pattern[0])
  150. cloze_s_ele['number'] = value
  151. elif 'C' not in move_m_list and 'T' in move_m_list:
  152. index_t = key_words_list.index('T')
  153. index_c = key_words_list.index('C')
  154. total_score_str = result[index_t][0]
  155. total_score_pattern = re.findall('\d+', total_score_str)
  156. total_score = int(total_score_pattern[0])
  157. value_per = float(total_score / int(index_c))
  158. cloze_s_ele['default_points'] = value_per
  159. new_list.append(cloze_s_ele)
  160. return new_list
  161. def get_sheet_points_by_nlp(sheet_dict):
  162. import CRFPP
  163. # json_path = r'C:\Users\Administrator\Desktop\type_score_nlp\type_score_info\example\english\33.json'
  164. # file = open(json_path, 'r', encoding='gbk').read()
  165. # json_file = ast.literal_eval(file)
  166. regions = sheet_dict['regions']
  167. ocr_list = []
  168. new_list = []
  169. choice_m_list = [ele for ele in regions if ele['class_name'] == 'choice_m']
  170. cloze_list = [ele for ele in regions if ele['class_name'] == 'cloze']
  171. cloze_and_cloze_s_list = []
  172. for element_cloze in cloze_list:
  173. cloze_box = element_cloze['bounding_box']
  174. cloze_bbox = [cloze_box['xmin'], cloze_box['ymin'], cloze_box['xmax'], cloze_box['ymax']]
  175. cloze_s_dict = {}
  176. cloze_s_list = []
  177. for element in regions:
  178. if element['class_name'] == 'cloze_s':
  179. cloze_s_box = element['bounding_box']
  180. cloze_s_bbox = [cloze_s_box['xmin'], cloze_s_box['ymin'], cloze_s_box['xmax'], cloze_s_box['ymax']]
  181. if decide_coordinate_full_contains(cloze_bbox, cloze_s_bbox) == True:
  182. cloze_s_list.append(element)
  183. cloze_s_dict['cloze_info'] = element_cloze
  184. cloze_s_dict['cloze_s_info'] = cloze_s_list
  185. cloze_and_cloze_s_list.append(cloze_s_dict)
  186. print(cloze_and_cloze_s_list)
  187. for ele in regions:
  188. if 'type_score_ocr' in ele:
  189. ocr_list.append(ele)
  190. for index, ele in enumerate(ocr_list):
  191. ocr_content = ele['type_score_ocr']
  192. taggers = CRFPP.Tagger("-m " + './segment/sheet_resolve/model/nlp_model/crf2.model')
  193. tb = TagParse(taggers)
  194. result = tb.get_tag_val(ocr_content)
  195. if ele['class_name'] == 'cloze':
  196. print(ele)
  197. cloze_and_cloze_s_list0 = []
  198. for ele1 in cloze_and_cloze_s_list:
  199. print(ele1)
  200. ele1_cloze = [ele1['cloze_info']['bounding_box']['xmin'], ele1['cloze_info']['bounding_box']['ymin'],
  201. ele1['cloze_info']['bounding_box']['xmax'], ele1['cloze_info']['bounding_box']['ymax']]
  202. ele_cloze = [ele['bounding_box']['xmin'], ele['bounding_box']['ymin'],
  203. ele['bounding_box']['xmax'], ele['bounding_box']['ymax']]
  204. if ele1_cloze == ele_cloze:
  205. cloze_and_cloze_s_list0.append(ele1)
  206. new_list = analyse_cloze_result(result, cloze_and_cloze_s_list0)
  207. elif ele['class_name'] == 'choice':
  208. new_list = analyse_choice_result(result, choice_m_list)
  209. else:
  210. new_list = analyse_solve_solve0_result(result, ele)
  211. for index0, ele0 in enumerate(regions):
  212. for index1, ele1 in enumerate(new_list):
  213. class_name0 = ele0['class_name']
  214. bounding_box0 = ele0['bounding_box']
  215. class_name1 = ele1['class_name']
  216. bounding_box1 = ele1['bounding_box']
  217. if class_name0 == class_name1 and bounding_box0 == bounding_box1:
  218. ele0['default_points'] = ele1['default_points']
  219. # pop type_score_ocr
  220. for ele in regions:
  221. if 'type_score_ocr' in ele:
  222. ele.pop('type_score_ocr')
  223. sheet_dict.update({'regions': regions})
  224. return sheet_dict