# File: get_sheet_points_by_nlp.py
# Author: lynn
# Date: 2020/5/19 18:23
- import ast, cv2, re
- from segment.sheet_resolve.analysis.sheet.tag_parse import TagParse
- import CRFPP
- from segment.sheet_resolve.tools.utils import crop_region
- try:
- import xml.etree.cElementTree as ET
- except ImportError:
- import xml.etree.ElementTree as ET
def decide_coordinate_full_contains(coordinate1, coordinate2):
    """Return True when box ``coordinate1`` fully contains box ``coordinate2``.

    Args:
        coordinate1: outer box, indexable as ``[xmin, ymin, xmax, ymax]``.
        coordinate2: inner box, same layout.

    Returns:
        bool: True if every edge of ``coordinate2`` lies on or inside the
        matching edge of ``coordinate1`` (comparisons are inclusive).
    """
    xmin1, ymin1 = coordinate1[0], coordinate1[1]
    xmax1, ymax1 = coordinate1[2], coordinate1[3]
    xmin2, ymin2 = coordinate2[0], coordinate2[1]
    xmax2, ymax2 = coordinate2[2], coordinate2[3]
    # bool() keeps the return type a plain bool even for numpy-like scalars.
    return bool(xmin1 <= xmin2 and ymin1 <= ymin2
                and xmax1 >= xmax2 and ymax1 >= ymax2)
def analyse_solve_solve0_result(result, ele):
    """Copy the question number and total score from tagged text into ``ele``.

    Args:
        result: sequence of ``(text, tag)`` pairs; ``item[0]`` is the token
            text and ``item[1]`` its tag (presumably the output of
            ``TagParse.get_tag_val`` -- confirm against that class).
        ele: region dict, updated in place.

    Returns:
        list: ``[ele]`` when at least one of ``ele['number']`` (from the first
        ``'N'`` token) or ``ele['default_points']`` (from the first ``'T'``
        token) was written, otherwise ``[]``.
    """
    def first_int(text):
        # First run of digits in the token, or None when there is none.
        found = re.search(r'\d+', text)
        return int(found.group()) if found else None

    tags = [item[1] for item in result]
    # Results that carry an 'O' tag are left untouched, as before.
    if 'O' in tags:
        return []

    updated = False
    if 'N' in tags:
        number = first_int(result[tags.index('N')][0])
        if number is not None:
            ele['number'] = number
            updated = True
    if 'T' in tags:
        total_score = first_int(result[tags.index('T')][0])
        if total_score is not None:
            ele['default_points'] = total_score
            updated = True
    return [ele] if updated else []
def _parse_choice_segment(segment):
    """Return ``(question_numbers, points)`` for one tagged segment, or None.

    ``question_numbers`` is a ``range`` built from the first ``'O'`` token
    (``"a-b"`` or ``"a~b"``); ``points`` is the float value of the first
    ``'S'`` token. None is returned when either piece is missing or
    cannot be parsed, so that one malformed segment does not abort the sheet.
    """
    seg_tags = [item[1] for item in segment]
    if 'O' not in seg_tags or 'S' not in seg_tags:
        return None

    range_text = segment[seg_tags.index('O')][0]
    if '-' in range_text:
        ends = range_text.split('-')
    elif '~' in range_text:
        ends = range_text.split('~')
    else:
        return None
    try:
        first, last = int(ends[0]), int(ends[1])
    except ValueError:
        return None

    score_text = segment[seg_tags.index('S')][0]
    try:
        points = float(score_text)
    except ValueError:
        # Tolerate a unit suffix on the score token (e.g. "2pts").
        found = re.search(r'\d+(?:\.\d+)?', score_text)
        if found is None:
            return None
        points = float(found.group())
    return range(first, last + 1), points


def analyse_choice_result(result, choice_m_list):
    """Assign per-row default points to choice boxes from tagged header text.

    Args:
        result: sequence of ``(text, tag)`` pairs; ``item[0]`` is the token
            text and ``item[1]`` its tag.
        choice_m_list: list of choice-box dicts. Each must provide
            ``'number'`` (iterable of question numbers) and ``'rows'`` (int).
            Matching boxes are updated in place.

    Returns:
        list: the same ``choice_m_list`` object.
    """
    tags = [item[1] for item in result]
    if 'O' not in tags:
        return choice_m_list

    # Segment boundaries: every 'O' position plus the end of the sequence.
    bounds = sorted({pos for pos, tag in enumerate(tags) if tag == 'O'}
                    | {len(tags)})

    parsed_segments = []
    for pos in range(len(bounds) - 1):
        # The first segment starts at token 0 (not at the first 'O'), and each
        # segment stops one token short of the next boundary; this slicing is
        # kept exactly as the original code had it.
        start = 0 if pos == 0 else bounds[pos]
        stop = bounds[pos + 1] - 1
        parsed = _parse_choice_segment(result[start:stop])
        if parsed is not None:
            parsed_segments.append(parsed)

    for numbers, points in parsed_segments:
        for box in choice_m_list:
            rows = box['rows']
            if rows <= 0:
                # Avoid dividing by zero on an empty box.
                continue
            outside = sum(1 for num in box['number'] if num not in numbers)
            # Score the box only when at most 40% (relative to its row count)
            # of its question numbers fall outside the parsed range.
            if outside / rows <= 0.4:
                box['default_points'] = [points] * rows
    return choice_m_list
def analyse_cloze_result(result, cloze_and_cloze_s_list):
    """Assign a per-blank score to every cloze blank from tagged header text.

    The score is derived from ``result`` once:

    * ``'S'`` and ``'T'`` tags present: the first integer in the ``'S'``
      token is used as the per-blank score.
    * otherwise ``'C'`` and ``'T'`` present: the first integer in the ``'T'``
      token divided by the first integer in the ``'C'`` token.

    Args:
        result: sequence of ``(text, tag)`` pairs; ``item[0]`` is the token
            text and ``item[1]`` its tag.
        cloze_and_cloze_s_list: list of dicts, each with a ``'cloze_s_info'``
            list of blank dicts. Blanks are updated in place.

    Returns:
        list: the blank dicts that received ``'default_points'``; empty when
        no score could be derived.
    """
    def first_int(text):
        # First run of digits in the token, or None when there is none.
        found = re.search(r'\d+', text)
        return int(found.group()) if found else None

    tags = [item[1] for item in result]

    points = None
    if 'T' in tags:
        if 'S' in tags:
            points = first_int(result[tags.index('S')][0])
        elif 'C' in tags:
            # The old test was "'C' not in ...", which made the following
            # .index('C') raise ValueError every time, and it divided by the
            # token position instead of the count found in the token text.
            total_score = first_int(result[tags.index('T')][0])
            blank_count = first_int(result[tags.index('C')][0])
            if total_score is not None and blank_count:
                points = float(total_score) / blank_count

    if points is None:
        return []

    scored = []
    for group in cloze_and_cloze_s_list:
        for blank in group['cloze_s_info']:
            # NOTE(review): the 'S' value used to be written to 'number';
            # it is stored as 'default_points' here because 'S' is the score
            # tag elsewhere in this module -- confirm with the tagger owners.
            blank['default_points'] = points
            scored.append(blank)
    return scored
def analyse_cloze_result1(result, cloze_and_cloze_s_list):
    """Alias of ``analyse_cloze_result``, kept for existing callers.

    This function used to be a line-for-line copy of
    ``analyse_cloze_result``; it now delegates so the two cannot drift apart.

    Args:
        result: sequence of ``(text, tag)`` pairs.
        cloze_and_cloze_s_list: list of dicts with a ``'cloze_s_info'`` list.

    Returns:
        list: whatever ``analyse_cloze_result`` returns for the same input.
    """
    return analyse_cloze_result(result, cloze_and_cloze_s_list)
def _bbox_as_list(region):
    """Return a region's bounding box as ``[xmin, ymin, xmax, ymax]``."""
    box = region['bounding_box']
    return [box['xmin'], box['ymin'], box['xmax'], box['ymax']]


def get_sheet_points_by_nlp(sheet_dict):
    """Fill ``default_points`` on sheet regions from their OCR'd score text.

    Every region carrying a ``'type_score_ocr'`` string is tagged with the
    CRF model and handed to the analyser matching its ``'class_name'``
    (``'cloze'``, ``'choice'``, anything else is treated as a solve region).
    Scores returned by the analyser are copied onto every region with the
    same class name and bounding box. Finally the ``'type_score_ocr'`` key
    is removed from all regions.

    Args:
        sheet_dict: dict with a ``'regions'`` list; each region is a dict with
            at least ``'class_name'`` and ``'bounding_box'``. Modified in place.

    Returns:
        dict: the same ``sheet_dict`` object.
    """
    regions = sheet_dict['regions']
    choice_m_list = [reg for reg in regions if reg['class_name'] == 'choice_m']
    cloze_s_regions = [reg for reg in regions if reg['class_name'] == 'cloze_s']

    # Pair every cloze region with the cloze_s blanks it fully contains.
    cloze_and_cloze_s_list = []
    for cloze in regions:
        if cloze['class_name'] != 'cloze':
            continue
        cloze_bbox = _bbox_as_list(cloze)
        blanks = [
            blank for blank in cloze_s_regions
            if decide_coordinate_full_contains(cloze_bbox, _bbox_as_list(blank))
        ]
        cloze_and_cloze_s_list.append(
            {'cloze_info': cloze, 'cloze_s_info': blanks})

    ocr_list = [reg for reg in regions if 'type_score_ocr' in reg]
    for ele in ocr_list:
        # NOTE(review): the model is loaded once per region. Hoisting it out
        # of the loop is only safe if TagParse resets the tagger between
        # calls -- confirm before changing.
        taggers = CRFPP.Tagger("-m " + './segment/sheet_resolve/model/nlp_model/crf2.model')
        result = TagParse(taggers).get_tag_val(ele['type_score_ocr'])

        if ele['class_name'] == 'cloze':
            ele_bbox = _bbox_as_list(ele)
            same_box_groups = [
                group for group in cloze_and_cloze_s_list
                if _bbox_as_list(group['cloze_info']) == ele_bbox
            ]
            new_list = analyse_cloze_result(result, same_box_groups)
        elif ele['class_name'] == 'choice':
            new_list = analyse_choice_result(result, choice_m_list)
        else:
            new_list = analyse_solve_solve0_result(result, ele)

        for region in regions:
            for scored in new_list:
                # Analysers may return elements that received no score;
                # reading 'default_points' unconditionally raised KeyError.
                if 'default_points' not in scored:
                    continue
                if (region['class_name'] == scored['class_name']
                        and region['bounding_box'] == scored['bounding_box']):
                    region['default_points'] = scored['default_points']

    # The raw OCR text is an intermediate value; drop it from the output.
    for region in regions:
        region.pop('type_score_ocr', None)

    sheet_dict['regions'] = regions
    return sheet_dict
|